In [1]:
import time
import json
import datetime

import numpy as np

from runs_collector.dataset import RunsDataSet
from runs_analysis.resource_usage import get_tiers, calculate_costs
from optimization.optimization_heuristics import (compute_wasted_schedule1, 
                                                  compute_wasted_schedule2, 
                                                  failed_jobs_prioritization, 
                                                  timeout_value_optimization, 
                                                  is_commit_inbetween)

import warnings
warnings.filterwarnings('ignore')

## Load DataSet

In [2]:
start = time.time()

data_set = RunsDataSet(None, None, from_checkpoint=True, checkpoint_dir="./")

end = time.time()

print("Time taken to load the dataset:", round(end - start, 0), "seconds")

Loading dataset from checkpoints
Time taken to load the dataset: 67.0 seconds


In [3]:
all_runs = data_set.get_all_runs()
all_jobs = data_set.get_all_jobs()
all_jobs = data_set.get_all_jobs()
all_runs = data_set.get_all_runs()
jobs_runs_time = all_jobs.groupby("run_id").agg({"up_time": "sum", "start_ts": "min"}).reset_index()
runs_with_time = all_runs.merge(jobs_runs_time, left_on="id", right_on="run_id")
repos_list_1, repos_list_2 = get_tiers(data_set)
optimizations = {}

## Wasted schedule afeter K failures

In [4]:
wasted_schedule_paid = compute_wasted_schedule1(all_runs, all_jobs, repos_list_1)
wasted_schedule_free = compute_wasted_schedule1(all_runs, all_jobs, repos_list_2)

1423240940
0.14917972005498942
1423240940
0.0052491049055966585


In [5]:
optimizations["wasted_schedule"] = {
    "paid":{
        "all_runs":wasted_schedule_paid[0]*100,
        "subset_runs": wasted_schedule_paid[1]*100,
        "saved_time_all": wasted_schedule_paid[2]*100,
        "saved_subset": wasted_schedule_paid[3]*100,
        "saved_cost": wasted_schedule_paid[4]
    },
    "free":{
        "all_runs":wasted_schedule_free[0],
        "subset_runs": wasted_schedule_free[1]*100,
        "saved_time_all": wasted_schedule_free[2]*100,
        "saved_subset": wasted_schedule_free[3]*100,
        "saved_cost": wasted_schedule_free[4]
    }
}

In [6]:
optimizations

{'wasted_schedule': {'paid': {'all_runs': 4.51,
   'subset_runs': 17.23,
   'saved_time_all': 3.17,
   'saved_subset': 21.279999999999998,
   'saved_cost': 125.72},
  'free': {'all_runs': 0.004,
   'subset_runs': 1.0,
   'saved_time_all': 0.03,
   'saved_subset': 4.9,
   'saved_cost': 1.55}}}

## Wasted schedule during inactivity

In [7]:
all_repos = data_set.get_all_repositories()

In [8]:
# commits history
commits_dict = {}
with open("commits_messages_by_repo.json") as cmr:
    collected_messages = json.load(cmr)
    
for cm in collected_messages:
    if cm:
        repo_name = cm[0][0]
        if repo_name not in commits_dict:
            commits_dict[repo_name] = [x[1] for x in cm]
commits_dict_2 = {}
with open("scraped_commits_messages_part2.json") as cmr:
    collected_messages = json.load(cmr)
    
for cm in collected_messages:
    repo_name = cm[0]
    if repo_name in commits_dict_2:
        commits_dict_2[repo_name].append(cm[1])
    else:
        commits_dict_2[repo_name] = [cm[1]]

commits_dict_3 = {}

with open("collected_commits_messages_part3.json") as cmm:
    collected_messages = json.load(cmm)
    
for cm in collected_messages:
    repo_name = cm[0]
    if repo_name in commits_dict_3:
        commits_dict_3[repo_name].append(cm[1])
    else:
        commits_dict_3[repo_name] = [cm[1]]

commits_dict_3.update(commits_dict_2)
commits_dict_3.update(commits_dict)

In [9]:
all_runs_sub_1, total_waste_time, total_over_schedule, total_over_total, wasted_fails, saved_cost = compute_wasted_schedule2(
                                                                                all_runs, all_jobs, all_repos, commits_dict_3, repos_list_1)
all_runs_sub_2, total_waste_time2, total_over_schedule2, total_over_total2, wasted_fails2, saved_cost2 = compute_wasted_schedule2(
                                                                                all_runs, all_jobs, all_repos, commits_dict_3, repos_list_2)

1423240940
212318685
1423240940
7470741


In [10]:
optimizations["wasted_schedule_2"] = {
    "paid":{
        "all_runs":len(wasted_fails)/all_runs_sub_1.shape[0]*100,
        "subset_runs": len(wasted_fails)/all_runs_sub_1[all_runs_sub_1.event=="schedule"].shape[0]*100,
        "saved_time_all": total_over_total,
        "saved_subset": total_over_schedule,
        "saved_cost": saved_cost
    },
    "free":{
        "all_runs":len(wasted_fails2)/all_runs_sub_2.shape[0]*100,
        "subset_runs": len(wasted_fails2)/all_runs_sub_2[all_runs_sub_2.event=="schedule"].shape[0]*100,
        "saved_time_all": total_over_total2,
        "saved_subset": total_over_schedule2,
        "saved_cost": saved_cost2
    }
}

In [11]:
optimizations

{'wasted_schedule': {'paid': {'all_runs': 4.51,
   'subset_runs': 17.23,
   'saved_time_all': 3.17,
   'saved_subset': 21.279999999999998,
   'saved_cost': 125.72},
  'free': {'all_runs': 0.004,
   'subset_runs': 1.0,
   'saved_time_all': 0.03,
   'saved_subset': 4.9,
   'saved_cost': 1.55}},
 'wasted_schedule_2': {'paid': {'all_runs': 4.483177643393134,
   'subset_runs': 17.11288579226899,
   'saved_time_all': 0.0152,
   'saved_subset': 0.1016,
   'saved_cost': 99.78},
  'free': {'all_runs': 0.5579242728169694,
   'subset_runs': 1.3859651705406957,
   'saved_time_all': 0.0004,
   'saved_subset': 0.0753,
   'saved_cost': 3.81}}}

## Failed jobs

In [12]:
time_overall, time_over_impacted, impacted_runs, inlined_ids = failed_jobs_prioritization(data_set, repos_list_1)


In [13]:
impact_over_subset = all_runs[all_runs.id.isin(all_jobs[all_jobs.id.isin(inlined_ids)].run_id.to_list())].id.unique().shape[0]/all_runs_sub_1[all_runs_sub_1.conclusion=="failure"].shape[0]*100 + len(inlined_ids) / all_runs_sub_1[all_runs_sub_1.conclusion=="failure"].shape[0] *100

### Delta cost

In [14]:
sub_runs = all_runs[all_runs.id.isin(all_jobs[all_jobs.id.isin(inlined_ids)].run_id.to_list())]
min_max_start_ts = sub_runs.groupby("repo_id").start_ts.agg(["min", "max"]).reset_index()
total_start_ts = 0
for i, row in min_max_start_ts.iterrows():
    total_start_ts += row["max"] - row["min"]
years = total_start_ts/(12*30*24*3600)
delta_cost = calculate_costs(all_jobs[all_jobs.id.isin(inlined_ids)].up_time.sum() / 60 / years)

In [15]:
time_overall2, time_over_impacted2, impacted_runs2, inlined_ids2 = failed_jobs_prioritization(data_set, repos_list_2)
impact_over_subset2 = all_runs[all_runs.id.isin(all_jobs[all_jobs.id.isin(inlined_ids2)].run_id.to_list())].id.unique().shape[0]/all_runs_sub_2[all_runs_sub_2.conclusion=="failure"].shape[0]*100
sub_runs2 = all_runs[all_runs.id.isin(all_jobs[all_jobs.id.isin(inlined_ids2)].run_id.to_list())]
min_max_start_ts2 = sub_runs2.groupby("repo_id").start_ts.agg(["min", "max"]).reset_index()
total_start_ts2 = 0
for i, row in min_max_start_ts2.iterrows():
    total_start_ts2 += row["max"] - row["min"]
years2 = total_start_ts2/(12*30*24*3600)


In [16]:
delta_cost2 = calculate_costs(all_jobs[all_jobs.id.isin(inlined_ids2)].up_time.sum() / 60 / years2)

In [17]:
optimizations["failed_jobs"] = {
    "paid":{
        "all_runs": impacted_runs,
        "subset_runs": impact_over_subset,
        "saved_time_all": time_overall*100,
        "saved_subset": time_over_impacted*100,
        "saved_cost": delta_cost
    },
    "free":{
        "all_runs":impacted_runs2,
        "subset_runs": impact_over_subset2,
        "saved_time_all": time_overall2*100,
        "saved_subset": time_over_impacted2*100,
        "saved_cost": delta_cost2
    }
}

In [18]:
optimizations

{'wasted_schedule': {'paid': {'all_runs': 4.51,
   'subset_runs': 17.23,
   'saved_time_all': 3.17,
   'saved_subset': 21.279999999999998,
   'saved_cost': 125.72},
  'free': {'all_runs': 0.004,
   'subset_runs': 1.0,
   'saved_time_all': 0.03,
   'saved_subset': 4.9,
   'saved_cost': 1.55}},
 'wasted_schedule_2': {'paid': {'all_runs': 4.483177643393134,
   'subset_runs': 17.11288579226899,
   'saved_time_all': 0.0152,
   'saved_subset': 0.1016,
   'saved_cost': 99.78},
  'free': {'all_runs': 0.5579242728169694,
   'subset_runs': 1.3859651705406957,
   'saved_time_all': 0.0004,
   'saved_subset': 0.0753,
   'saved_cost': 3.81}},
 'failed_jobs': {'paid': {'all_runs': 1.0311084492222633,
   'subset_runs': 29.516355140186917,
   'saved_time_all': 108.02801246006877,
   'saved_subset': 3161.909848384515,
   'saved_cost': 17.89},
  'free': {'all_runs': 0.777775473257857,
   'subset_runs': 7.739938080495357,
   'saved_time_all': 2.7742807904331364,
   'saved_subset': 4527.272768966877,
   's

## Timeout value

In [19]:
sub_repos_list = repos_list_1
saved_time, impacted_runs = timeout_value_optimization(data_set, sub_repos_list)
all_runs = data_set.get_all_runs()
impacted_runs1 = len(impacted_runs) / all_runs[all_runs.repo_id.isin(sub_repos_list)].shape[0]*100
saved_time1 = sum([s for s in saved_time if not np.isnan(s)]) / all_jobs[all_jobs.run_id.isin(all_runs[all_runs.repo_id.isin(sub_repos_list)].id.to_list())].up_time.sum()*100
sub_runs = all_runs[all_runs.id.isin(impacted_runs)]
min_max_start_ts = sub_runs.groupby("repo_id").start_ts.agg(["min", "max"]).reset_index()
total_start_ts = 0
for i, row in min_max_start_ts.iterrows():
    total_start_ts += row["max"] - row["min"]
years = total_start_ts/(12*30*24*3600)
saved_cost1 = calculate_costs(sum([s for s in saved_time if not np.isnan(s)]) / 60 / years)

In [20]:
sub_repos_list = repos_list_2
saved_time, impacted_runs = timeout_value_optimization(data_set, sub_repos_list)
all_runs = data_set.get_all_runs()
impacted_runs2 = len(impacted_runs) / all_runs[all_runs.repo_id.isin(sub_repos_list)].shape[0]*100
saved_time2 = sum([s for s in saved_time if not np.isnan(s)]) / all_jobs[all_jobs.run_id.isin(all_runs[all_runs.repo_id.isin(sub_repos_list)].id.to_list())].up_time.sum()*100
sub_runs = all_runs[all_runs.id.isin(impacted_runs)]
min_max_start_ts = sub_runs.groupby("repo_id").start_ts.agg(["min", "max"]).reset_index()
total_start_ts = 0
for i, row in min_max_start_ts.iterrows():
    total_start_ts += row["max"] - row["min"]
years = total_start_ts/(12*30*24*3600)
saved_cost2 = calculate_costs(sum([s for s in saved_time if not np.isnan(s)]) / 60 / years)

In [21]:
optimizations["vm_timeout"] = {
    "paid":{
        "all_runs": impacted_runs1,
        "saved_time_all": saved_time1,
        "saved_cost": saved_cost1
    },
    "free":{
        "all_runs": impacted_runs2,
        "saved_time_all": saved_time2,
        "saved_cost": saved_cost2
    }
}

In [22]:
names_dict = {
    "wasted_schedule": "Deactivate after k failures",
    "wasted_schedule_2": "Deactivate during inactivity",
    "failed_jobs": "Run failed jobs first"
}

In [30]:
print("{:<40} {:<40} {:<40} {:<40}".format(
        "Optimization heuristic",
        "Impacted runs %",
        "Time saving %",
        "Annual cost delta $"
         ))
print("-"*40*4)
optim = "wasted_schedule"
o = optimizations[optim]
print("{:<40} {:<40} {:<40} {:<40}".format(
    "Deactivate scheduled workflows",
    str(round(o["paid"]["all_runs"], 1)) + "% (" + str(round(o["free"]["all_runs"], 1)) + "%) of all runs",
    str(round(o["paid"]["saved_time_all"], 1)) + "% (" + str(round(o["free"]["saved_time_all"], 1)) + "%) of all runs time",
    "-" + str(round(o["paid"]["saved_cost"], 2)) + " (-" + str(round(o["free"]["saved_cost"], 2)) + ")"
        ))
print("{:<40} {:<40} {:<40} {:<40}".format(
    "after k consecutive failures (k=3)",
    str(round(o["paid"]["subset_runs"], 1)) + "% (" + str(round(o["free"]["subset_runs"], 1)) + "%) of scheduled runs",
    str(round(o["paid"]["saved_subset"], 1)) + "% (" + str(round(o["free"]["saved_subset"], 1)) + "%) of scheduled runs",
    ""))
print("-"*40*4)

optim = "wasted_schedule_2"
o = optimizations[optim]
print("{:<40} {:<40} {:<40} {:<40}".format(
    "Deactivate scheduled workflows",
    str(round(o["paid"]["all_runs"], 1)) + "% (" + str(round(o["free"]["all_runs"], 1)) + "%) of all runs",
    str(round(o["paid"]["saved_time_all"], 1)) + "% (" + str(round(o["free"]["saved_time_all"], 1)) + "%) of all runs time",
    "-" + str(round(o["paid"]["saved_cost"], 2)) + " (-" + str(round(o["free"]["saved_cost"], 2)) + ")"
        ))  
print("{:<40} {:<40} {:<40} {:<40}".format(
    "during repository inactivity",
    str(round(o["paid"]["subset_runs"], 1)) + "% (" + str(round(o["free"]["subset_runs"], 1)) + "%) of scheduled runs",
    str(round(o["paid"]["saved_subset"], 1)) + "% (" + str(round(o["free"]["saved_subset"], 1)) + "%) of scheduled runs",
    ""))
print("-"*40*4)

optim = "failed_jobs"
o = optimizations[optim]
print("{:<40} {:<40} {:<40} {:<40}".format(
    "Run previously failed jobs",
    str(round(o["paid"]["all_runs"], 1)) + "% (" + str(round(o["free"]["all_runs"], 1)) + "%) of all runs",
    str(round(o["paid"]["saved_time_all"]/100, 1)) + "% (" + str(round(o["free"]["saved_time_all"]/100, 1)) + "%) of all runs time",
    "-" + str(round(o["paid"]["saved_cost"], 2)) + " (-" + str(round(o["free"]["saved_cost"], 2)) + ")"
        ))
print("{:<40} {:<40} {:<40} {:<40}".format(
    "first",
    str(round(o["paid"]["subset_runs"], 1)) + "% (" + str(round(o["free"]["subset_runs"], 1)) + "%) of failed runs",
    str(round(o["paid"]["saved_subset"]/100, 1)) + "% (" + str(round(o["free"]["saved_subset"]/100, 1)) + "%) of failed runs",
    ""))
print("-"*40*4)

optim = "vm_timeout"
o = optimizations[optim]
print("{:<40} {:<40} {:<40} {:<40}".format(
    "Project-specific timeouts",
    str(round(o["paid"]["all_runs"], 1)) + "% (" + str(round(o["free"]["all_runs"], 1)) + "%) of all runs",
    str(round(o["paid"]["saved_time_all"], 1)) + "% (" + str(round(o["free"]["saved_time_all"], 1)) + "%) of all runs time",
    "-" + str(round(o["paid"]["saved_cost"], 2)) + " (-" + str(round(o["free"]["saved_cost"], 2)) + ")"
        ))
print("-"*40*4)

Optimization heuristic                   Impacted runs %                          Time saving %                            Annual cost delta $                     
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Deactivate scheduled workflows           4.5% (0.0%) of all runs                  3.2% (0.0%) of all runs time             -125.72 (-1.55)                         
after k consecutive failures (k=3)       17.2% (1.0%) of scheduled runs           21.3% (4.9%) of scheduled runs                                                   
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Deactivate scheduled workflows           4.5% (0.6%) of all runs                  0.0% (0.0%) of all runs time             -99.78 (-3.81)                          
during repository inac