In [1]:
import time
import json
import os
import datetime

import pandas as pd
import numpy as np

from runs_collector.dataset import RunsDataSet
from runs_analysis.resource_usage import get_tiers
from optimization.optimization_heuristics import get_wasted_schedule_1, get_wasted_schedule_2, failed_jobs_prioritization, timeout_value_optimization

import warnings
warnings.filterwarnings('ignore')

In [3]:
start = time.time()

data_set = RunsDataSet(None, None, from_checkpoint=True, checkpoint_dir="./")

end = time.time()

print("Time taken to load the dataset:", round(end - start, 0), "seconds")

Loading dataset from checkpoints
Time taken to load the dataset: 78.0 seconds


In [4]:
all_runs = data_set.get_all_runs()

all_runs.repo_id.unique().shape[0]

952

In [6]:
all_jobs = data_set.get_all_jobs()
all_jobs = data_set.get_all_jobs()
all_runs = data_set.get_all_runs()
jobs_runs_time = all_jobs.groupby("run_id").agg({"up_time": "sum", "start_ts": "min"}).reset_index()
runs_with_time = all_runs.merge(jobs_runs_time, left_on="id", right_on="run_id")
repos_list_1, repos_list_2 = get_tiers(data_set)
optimizations = {}

## Wasted schedule afeter K

In [7]:
all_runs_sub_1 = all_runs[all_runs.repo_id.isin(repos_list_1)]
total_waste, waste_over_total_schedule, total_over_total, impacted_runs_schedule, impacted_runs_all, opt_repos = get_wasted_schedule_1(all_runs_sub_1, all_jobs, 3)

1423240940
0.14917972005498942


### Impacted runs

In [8]:
print("Among all runs:", impacted_runs_all)
print("Among scheduled:", impacted_runs_schedule)

Among all runs: 0.0451
Among scheduled: 0.1723


### Saved time

In [9]:
print("Saved time over all runs:", total_over_total)
print("Saved time over all schedule:", waste_over_total_schedule)

Saved time over all runs: 0.0317
Saved time over all schedule: 0.2128


### Saved cost

In [10]:
all_runs["start_ts"] = all_runs.created_at.apply(lambda x: int(time.mktime(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timetuple())))
sub_runs = all_runs[all_runs.repo_id.isin(opt_repos)]
min_max_start_ts = sub_runs.groupby("repo_id").start_ts.agg(["min", "max"]).reset_index()
total_start_ts = 0
for i, row in min_max_start_ts.iterrows():
    total_start_ts += row["max"] - row["min"]
years = total_start_ts/(12*30*24*3600)
saved_cost = total_waste / 60 / years * 0.008 * 1.52
print("saved cost yearly per repo:", saved_cost)

saved cost yearly per repo: 125.71513276249759


In [11]:
all_runs_sub_2 = all_runs[all_runs.repo_id.isin(repos_list_2)]
total_waste2, waste_over_total_schedule2, total_over_total2, impacted_runs_schedule2, impacted_runs_all2, opt_repos2 = get_wasted_schedule_1(
    all_runs_sub_2, all_jobs, 3)

1423240940
0.0052491049055966585


In [12]:
all_runs["start_ts"] = all_runs.created_at.apply(lambda x: int(time.mktime(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timetuple())))
sub_runs = all_runs[all_runs.repo_id.isin(opt_repos2)]
min_max_start_ts = sub_runs.groupby("repo_id").start_ts.agg(["min", "max"]).reset_index()
total_start_ts = 0
for i, row in min_max_start_ts.iterrows():
    total_start_ts += row["max"] - row["min"]
years = total_start_ts/(12*30*24*3600)
saved_cost2 = total_waste2 / 60 / years * 0.008 * 1.52
print("saved cost yearly per repo:", saved_cost2)

saved cost yearly per repo: 1.5510287232659694


In [13]:
optimizations["wasted_schedule"] = {
    "paid":{
        "all_runs":impacted_runs_all,
        "subset_runs": impacted_runs_schedule,
        "saved_time_all": total_over_total,
        "saved_subset": waste_over_total_schedule,
        "saved_cost": saved_cost
    },
    "free":{
        "all_runs":impacted_runs_all2,
        "subset_runs": impacted_runs_schedule2,
        "saved_time_all": total_over_total2,
        "saved_subset": waste_over_total_schedule2,
        "saved_cost": saved_cost2
    }
}

## Wasted schedule during inactivity

In [15]:
all_repos = data_set.get_all_repositories()

In [16]:
def is_commit_inbetween(ts, previous_ts, repo_name, commits):
    if repo_name in commits:
        for cm_ts in commits[repo_name]:
            if cm_ts < ts and previous_ts < cm_ts:
                return True
        return False
    else:
        return False 


In [17]:
commits_dict = {}
with open("commits_messages_by_repo.json") as cmr:
    collected_messages = json.load(cmr)
    
for cm in collected_messages:
    if cm:
        repo_name = cm[0][0]
        if repo_name not in commits_dict:
            commits_dict[repo_name] = [x[1] for x in cm]

In [18]:
commits_dict_2 = {}
with open("scraped_commits_messages_part2.json") as cmr:
    collected_messages = json.load(cmr)
    
for cm in collected_messages:
    repo_name = cm[0]
    if repo_name in commits_dict_2:
        commits_dict_2[repo_name].append(cm[1])
    else:
        commits_dict_2[repo_name] = [cm[1]]

In [19]:
commits_dict_3 = {}

with open("collected_commits_messages_part3.json") as cmm:
    collected_messages = json.load(cmm)
    
for cm in collected_messages:
    repo_name = cm[0]
    if repo_name in commits_dict_3:
        commits_dict_3[repo_name].append(cm[1])
    else:
        commits_dict_3[repo_name] = [cm[1]]

In [20]:
commits_dict_3.update(commits_dict_2)
commits_dict_3.update(commits_dict)

In [21]:
all_runs_sub_1 = all_runs[all_runs.repo_id.isin(repos_list_1)]
total_waste_time, total_over_schedule, total_over_total, wasted_fails = get_wasted_schedule_2(all_runs_sub_1, all_jobs, all_repos, commits_dict_3)

1423240940
212318685


### Impacted runs

In [22]:
print("Over all runs:", len(wasted_fails)/all_runs_sub_1.shape[0]*100)
print("Over all scheduled:", len(wasted_fails)/all_runs_sub_1[all_runs_sub_1.event=="schedule"].shape[0]*100)

Over all runs: 4.199503136716852
Over all scheduled: 16.03006244216518


### Saved time

In [23]:
print("Saved time over all runs:", total_over_total)
print("Saved time over all schedule:", total_over_schedule)

Saved time over all runs: 0.0016
Saved time over all schedule: 0.0107


### Cost delta

In [24]:
sub_runs = all_runs[all_runs.id.isin(wasted_fails)]
min_max_start_ts = sub_runs.groupby("repo_id").start_ts.agg(["min", "max"]).reset_index()
total_start_ts = 0
for i, row in min_max_start_ts.iterrows():
    total_start_ts += row["max"] - row["min"]
years = total_start_ts/(12*30*24*3600)
saved_cost = total_waste_time / 60 / years * 0.008 * 1.52

In [25]:
all_runs_sub_2 = all_runs[all_runs.repo_id.isin(repos_list_2)]
total_waste_time2, total_over_schedule2, total_over_total2, wasted_fails2 = get_wasted_schedule_2(all_runs_sub_2, all_jobs, all_repos, commits_dict_3)

1423240940
7470741


In [26]:
sub_runs2 = all_runs[all_runs.id.isin(wasted_fails2)]
min_max_start_ts2 = sub_runs2.groupby("repo_id").start_ts.agg(["min", "max"]).reset_index()
total_start_ts2 = 0
for i, row in min_max_start_ts2.iterrows():
    total_start_ts2 += row["max"] - row["min"]
years = total_start_ts2/(12*30*24*3600)
saved_cost2 = total_waste_time2 / 60 / years * 0.008 * 1.52

In [27]:
optimizations["wasted_schedule_2"] = {
    "paid":{
        "all_runs":len(wasted_fails)/all_runs_sub_1.shape[0]*100,
        "subset_runs": len(wasted_fails)/all_runs_sub_1[all_runs_sub_1.event=="schedule"].shape[0]*100,
        "saved_time_all": total_over_total,
        "saved_subset": waste_over_total_schedule,
        "saved_cost": saved_cost
    },
    "free":{
        "all_runs":len(wasted_fails2)/all_runs_sub_2.shape[0]*100,
        "subset_runs": len(wasted_fails2)/all_runs_sub_2[all_runs_sub_2.event=="schedule"].shape[0]*100,
        "saved_time_all": total_over_total2,
        "saved_subset": waste_over_total_schedule2,
        "saved_cost": saved_cost2
    }
}

## Failed jobs

In [29]:
time_overall, time_over_impacted, impacted_runs, inlined_ids = failed_jobs_prioritization(data_set, repos_list_1)

### Saved time

In [30]:
print("Time over all:", time_overall)
print("Time over impacted:", time_over_impacted)

Time over all: 1.0826092453467506
Time over impacted: 31.818142113442928


### Impacted runs

In [31]:
impacted_runs

1.0299880113143856

In [32]:
impact_over_subset = all_runs[all_runs.id.isin(all_jobs[all_jobs.id.isin(inlined_ids)].run_id.to_list())].id.unique().shape[0]/all_runs_sub_1[all_runs_sub_1.conclusion=="failure"].shape[0]*100 + len(inlined_ids) / all_runs_sub_1[all_runs_sub_1.conclusion=="failure"].shape[0] *100

### Delta cost

In [33]:
sub_runs = all_runs[all_runs.id.isin(all_jobs[all_jobs.id.isin(inlined_ids)].run_id.to_list())]
min_max_start_ts = sub_runs.groupby("repo_id").start_ts.agg(["min", "max"]).reset_index()
total_start_ts = 0
for i, row in min_max_start_ts.iterrows():
    total_start_ts += row["max"] - row["min"]
years = total_start_ts/(12*30*24*3600)
delta_cost = all_jobs[all_jobs.id.isin(inlined_ids)].up_time.sum() / 60 / years * 0.008 * 1.52

In [34]:
time_overall2, time_over_impacted2, impacted_runs2, inlined_ids2 = failed_jobs_prioritization(data_set, repos_list_2)
impact_over_subset2 = all_runs[all_runs.id.isin(all_jobs[all_jobs.id.isin(inlined_ids2)].run_id.to_list())].id.unique().shape[0]/all_runs_sub_2[all_runs_sub_2.conclusion=="failure"].shape[0]*100
sub_runs2 = all_runs[all_runs.id.isin(all_jobs[all_jobs.id.isin(inlined_ids2)].run_id.to_list())]
min_max_start_ts2 = sub_runs2.groupby("repo_id").start_ts.agg(["min", "max"]).reset_index()
total_start_ts2 = 0
for i, row in min_max_start_ts2.iterrows():
    total_start_ts2 += row["max"] - row["min"]
years2 = total_start_ts2/(12*30*24*3600)


In [35]:
delta_cost2 = all_jobs[all_jobs.id.isin(inlined_ids2)].up_time.sum() / 60 / years2 * 0.008 * 1.52

In [36]:
optimizations["failed_jobs"] = {
    "paid":{
        "all_runs": impacted_runs,
        "subset_runs": impact_over_subset,
        "saved_time_all": time_overall,
        "saved_subset": time_over_impacted,
        "saved_cost": delta_cost
    },
    "free":{
        "all_runs":impacted_runs2,
        "subset_runs": impact_over_subset2,
        "saved_time_all": time_overall2,
        "saved_subset": time_over_impacted2,
        "saved_cost": delta_cost2
    }
}

## Timeout value

In [63]:
def timeout_value_optimization(data_set, repos_list):
    all_runs = data_set.get_all_runs()
    all_runs = all_runs[all_runs.repo_id.isin(repos_list)]
    all_jobs = data_set.get_all_jobs()
    runs_total_time = all_jobs.groupby("run_id").agg({"up_time": "sum"}).reset_index()
    all_runs_time = all_runs.merge(runs_total_time, left_on="id", right_on="run_id")
    all_runs_time["start_ts"] = all_runs_time.created_at.apply(lambda x: int(time.mktime(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timetuple())))
    saved_time = []
    total_impact_time = []
    impacted_runs = []
    window = 10
    fraction = 0.1
    workflows_ids = all_runs.workflow_id.unique().tolist()
    for workflow_id in workflows_ids:
        workflow_runs = all_runs_time[all_runs_time.workflow_id==workflow_id].sort_values("run_number")
        if workflow_runs.shape[0]==0:
            continue
        else:
            workflow_jobs = all_jobs[all_jobs.run_id.isin(workflow_runs.id)]
            jobs_names = workflow_jobs.name.unique().tolist()
            for job_name in jobs_names:
                jobs = workflow_jobs[workflow_jobs.name==job_name]
                max_time = jobs[jobs.up_time<=21541].up_time.max()
                target_jobs = jobs[jobs.up_time>21541]
                if target_jobs.shape[0]!=0:
                    saved_time.append(target_jobs.up_time.sum() - target_jobs.shape[0] * max_time*1.1)
                    impacted_runs.extend(target_jobs.run_id.to_list())
                    
    return saved_time, impacted_runs

In [77]:
sub_repos_list = repos_list_1
saved_time, impacted_runs = timeout_value_optimization(data_set, sub_repos_list)
all_runs = data_set.get_all_runs()
impacted_runs1 = len(impacted_runs) / all_runs[all_runs.repo_id.isin(sub_repos_list)].shape[0]*100
saved_time1 = sum([s for s in saved_time if not np.isnan(s)]) / all_jobs[all_jobs.run_id.isin(all_runs[all_runs.repo_id.isin(sub_repos_list)].id.to_list())].up_time.sum()*100
sub_runs = all_runs[all_runs.id.isin(impacted_runs)]
min_max_start_ts = sub_runs.groupby("repo_id").start_ts.agg(["min", "max"]).reset_index()
total_start_ts = 0
for i, row in min_max_start_ts.iterrows():
    total_start_ts += row["max"] - row["min"]
years = total_start_ts/(12*30*24*3600)
saved_cost1 = sum([s for s in saved_time if not np.isnan(s)]) / 60 / years * 0.008 * 1.52

In [78]:
sub_repos_list = repos_list_2
saved_time, impacted_runs = timeout_value_optimization(data_set, sub_repos_list)
all_runs = data_set.get_all_runs()
impacted_runs2 = len(impacted_runs) / all_runs[all_runs.repo_id.isin(sub_repos_list)].shape[0]*100
saved_time2 = sum([s for s in saved_time if not np.isnan(s)]) / all_jobs[all_jobs.run_id.isin(all_runs[all_runs.repo_id.isin(sub_repos_list)].id.to_list())].up_time.sum()*100
sub_runs = all_runs[all_runs.id.isin(impacted_runs)]
min_max_start_ts = sub_runs.groupby("repo_id").start_ts.agg(["min", "max"]).reset_index()
total_start_ts = 0
for i, row in min_max_start_ts.iterrows():
    total_start_ts += row["max"] - row["min"]
years = total_start_ts/(12*30*24*3600)
saved_cost2 = sum([s for s in saved_time if not np.isnan(s)]) / 60 / years * 0.008 * 1.52

In [79]:
optimizations["vm_timeout"] = {
    "paid":{
        "all_runs": impacted_runs1,
        "saved_time_all": saved_time1,
        "saved_cost": saved_cost1
    },
    "free":{
        "all_runs": impacted_runs2,
        "saved_time_all": saved_time2,
        "saved_cost": saved_cost2
    }
}

In [7]:
names_dict = {
    "wasted_schedule": "Deactivate after k failures",
    "wasted_schedule_2": "Deactivate during inactivity",
    "failed_jobs": "Run failed jobs first"
}

In [37]:
print("{:<40} {:<40} {:<40} {:<40}".format(
        "Optimization heuristic",
        "Impacted runs %",
        "Time saving %",
        "Annual cost delta $"
         ))
print("-"*40*4)
optim = "wasted_schedule"
o = optimizations[optim]
print("{:<40} {:<40} {:<40} {:<40}".format(
    "Deactivate scheduled workflows",
    str(round(o["paid"]["all_runs"]*100, 1)) + "% (" + str(round(o["free"]["all_runs"]*100, 1)) + "%) of all runs",
    str(round(o["paid"]["saved_time_all"]*100, 1)) + "% (" + str(round(o["free"]["saved_time_all"]*100, 1)) + "%) of all runs time",
    "-" + str(round(o["paid"]["saved_cost"], 2)) + " (-" + str(round(o["free"]["saved_cost"], 2)) + ")"
        ))
print("{:<40} {:<40} {:<40} {:<40}".format(
    "after k consecutive failures (k=3)",
    str(round(o["paid"]["subset_runs"]*100, 1)) + "% (" + str(round(o["free"]["subset_runs"]*100, 1)) + "%) of scheduled runs",
    str(round(o["paid"]["saved_subset"]*100, 1)) + "% (" + str(round(o["free"]["saved_subset"]*100, 1)) + "%) of scheduled runs",
    ""))
print("-"*40*4)

optim = "wasted_schedule_2"
o = optimizations[optim]
print("{:<40} {:<40} {:<40} {:<40}".format(
    "Deactivate scheduled workflows",
    str(round(o["paid"]["all_runs"], 1)) + "% (" + str(round(o["free"]["all_runs"], 1)) + "%) of all runs",
    str(round(o["paid"]["saved_time_all"], 1)) + "% (" + str(round(o["free"]["saved_time_all"], 1)) + "%) of all runs time",
    "-" + str(round(o["paid"]["saved_cost"], 2)) + " (-" + str(round(o["free"]["saved_cost"], 2)) + ")"
        ))
print("{:<40} {:<40} {:<40} {:<40}".format(
    "during repository inactivity",
    str(round(o["paid"]["subset_runs"], 1)) + "% (" + str(round(o["free"]["subset_runs"], 1)) + "%) of scheduled runs",
    str(round(o["paid"]["saved_subset"], 1)) + "% (" + str(round(o["free"]["saved_subset"], 1)) + "%) of scheduled runs",
    ""))
print("-"*40*4)

optim = "failed_jobs"
o = optimizations[optim]
print("{:<40} {:<40} {:<40} {:<40}".format(
    "Run previously failed jobs",
    str(round(o["paid"]["all_runs"], 1)) + "% (" + str(round(o["free"]["all_runs"], 1)) + "%) of all runs",
    str(round(o["paid"]["saved_time_all"], 1)) + "% (" + str(round(o["free"]["saved_time_all"], 1)) + "%) of all runs time",
    "-" + str(round(o["paid"]["saved_cost"], 2)) + " (-" + str(round(o["free"]["saved_cost"], 2)) + ")"
        ))
print("{:<40} {:<40} {:<40} {:<40}".format(
    "first",
    str(round(o["paid"]["subset_runs"], 1)) + "% (" + str(round(o["free"]["subset_runs"], 1)) + "%) of failed runs",
    str(round(o["paid"]["saved_subset"], 1)) + "% (" + str(round(o["free"]["saved_subset"], 1)) + "%) of failed runs",
    ""))
print("-"*40*4)

optim = "vm_timeout"
o = optimizations[optim]
print("{:<40} {:<40} {:<40} {:<40}".format(
    "Project-specific timeouts",
    str(round(o["paid"]["all_runs"], 1)) + "% (" + str(round(o["free"]["all_runs"], 1)) + "%) of all runs",
    str(round(o["paid"]["saved_time_all"], 1)) + "% (" + str(round(o["free"]["saved_time_all"], 1)) + "%) of all runs time",
    "-" + str(round(o["paid"]["saved_cost"], 2)) + " (-" + str(round(o["free"]["saved_cost"], 2)) + ")"
        ))
print("-"*40*4)

Optimization heuristic                   Impacted runs %                          Time saving %                            Annual cost delta $                     
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Deactivate scheduled workflows           4.5% (0.4%) of all runs                  3.2% (0.0%) of all runs time             -125.72 (-1.55)                         
after k consecutive failures (k=3)       17.2% (1.0%) of scheduled runs           21.3% (4.9%) of scheduled runs                                                   
----------------------------------------------------------------------------------------------------------------------------------------------------------------
Deactivate scheduled workflows           4.2% (0.2%) of all runs                  0.0% (0.0%) of all runs time             -68.97 (-7.97)                          
during repository inac