In [1]:
import time
import json
import os
import datetime
import numpy as np
from runs_collector.dataset import RunsDataSet
from runs_analysis.resource_usage import get_tiers
from optimization.github_optimization import (get_optimization_usage, 
                                              get_optimization_ts, 
                                              cancel_in_progress_impact, 
                                              calc_skip_impact,
                                              calc_cache_impact,
                                              get_cache_ts,
                                              calc_cancel_in_progress_impact,
                                              get_optimization_usage_avm,
                                              get_optimization_ts_avm)
import warnings
warnings.filterwarnings('ignore')

## Load data

In [2]:
start = time.time()

os.chdir("/workdir")
data_set = RunsDataSet(None, None, from_checkpoint=True, checkpoint_dir="./")

end = time.time()

print("Time taken to load the dataset:", round(end - start, 0), "seconds")

Loading dataset from checkpoints
Time taken to load the dataset: 59.0 seconds


In [3]:
all_jobs = data_set.get_all_jobs()
all_runs = data_set.get_all_runs()
jobs_runs_time = all_jobs.groupby("run_id").agg({"up_time": "sum", "start_ts": "min"}).reset_index()
runs_with_time = all_runs.merge(jobs_runs_time, left_on="id", right_on="run_id")
repos_list_1, repos_list_2 = get_tiers(data_set)
all_repos = data_set.get_all_repositories()
# paid tier repos
list_1_names = all_repos[all_repos.id.isin(repos_list_1)].full_name.to_list()
# free tier repos
list_2_names = all_repos[all_repos.id.isin(repos_list_2)].full_name.to_list()
optimizations = {}

## cancel in progress

In [4]:
collected_commits = []
with open("collected_commits_save_poins.json") as ccs:
    collected_commits += json.load(ccs)
with open("collected_commits_save_point_part2.json") as ccsp:
    collected_commits += json.load(ccsp)
#with open("collected_commits_save_point_part3.json") as ccsp:
#    collected_commits += json.load(ccsp)
with open("collected_commits_save_point_part4.json") as ccsp:
    collected_commits += json.load(ccsp)

In [5]:
cancel_usage = get_optimization_usage(collected_commits, optimization="cancel-in-progress")

### Impacted repos

In [6]:
created_with = set([x[0] for x in cancel_usage["created_with_optimization"]])
added = set([x[0] for x in cancel_usage["optimization_removed"]])
removed = set([x[0] for x in cancel_usage["optimization_added"]])
cip_adoption_1 = len((created_with|
    added|
    removed) & set(list_1_names))/len(list_1_names)*100
cip_adoption_2 = len((created_with|
    added|
    removed) & set(list_2_names))/len(list_2_names)*100

### Impact

In [7]:
optimized_runs1, possible_ids1, optimized_ids1 = cancel_in_progress_impact(data_set, repos_list_1, collected_commits)
optimized_runs2, possible_ids2, optimized_ids2 = cancel_in_progress_impact(data_set, repos_list_2, collected_commits)

In [8]:
cip_time2, cip_runs2, cip_cost2 = calc_cancel_in_progress_impact(data_set, possible_ids2, optimized_ids2, optimized_runs2)
cip_time1, cip_runs1, cip_cost1 = calc_cancel_in_progress_impact(data_set, possible_ids1, optimized_ids1, optimized_runs1)

In [9]:
optimizations["cancel_in_progress"]={
    "paid":{
        "adoption": cip_adoption_1,
        "impacted_runs": cip_runs1,
        "time_impact": cip_time1,
        "cost_impact": cip_cost1
        },
    "free":{
        "adoption": cip_adoption_2,
        "impacted_runs": cip_runs2,
        "time_impact": cip_time2,
        "cost_impact": cip_cost2
        } 
    }

## skip workflow

In [10]:
commits_dict = {}
with open("commits_messages_by_repo.json") as cmr:
    collected_messages = json.load(cmr)
    
for cm in collected_messages:
    if cm:
        repo_name = cm[0][0]
        if repo_name not in commits_dict:
            commits_dict[repo_name] = [x[2] for x in cm]

In [11]:
commits_dict_2 = {}
with open("scraped_commits_messages_part2.json") as cmr:
    collected_messages = json.load(cmr)
    
for cm in collected_messages:
    repo_name = cm[0]
    if repo_name in commits_dict_2:
        commits_dict_2[repo_name].append(cm[2])
    else:
        commits_dict_2[repo_name] = [cm[2]]

In [12]:
commits_dict_3 = {}

with open("collected_commits_messages_part3.json") as cmm:
    collected_messages = json.load(cmm)
    
for cm in collected_messages:
    repo_name = cm[0]
    if repo_name in commits_dict_3:
        commits_dict_3[repo_name].append(cm[2])
    else:
        commits_dict_3[repo_name] = [cm[2]]

In [13]:
commits_dict.update(commits_dict_2)
commits_dict.update(commits_dict_3)

In [14]:
commits_dict.keys()

dict_keys(['CJY0208/react-router-cache-route', 'cookpad/garage', 'jenkinsci/http-request-plugin', 'python273/vk_api', 'veusz/veusz', 'apache/echarts-doc', 'aeternity/aepp-sdk-js', 'spatie/async', 'vsn4ik/bootstrap-submenu', 'ebkr/r2modmanPlus', 'playpauseandstop/rororo', 'ldtteam/minecolonies', 'dotnetcore/FreeSql', 'stepjam/PyRep', 'grigorig/stcgal', 'KnpLabs/KnpMenuBundle', 'Nekmo/telegram-upload', 'heroku/salesforce-bulk', 'ant-design/create-react-app-antd', 'Malinskiy/adam', 'cyrilletuzi/vscode-angular-schematics', 'snoopwpf/snoopwpf', 'stotko/stdgpu', 'ralliejs/rallie', 'DistrictDataLabs/yellowbrick', 'nestjsx/nestjs-typeorm-paginate', 'wellyshen/react-cool-onclickoutside', 'stleamist/BetterSafariView', 'joncampbell123/dosbox-x', 'pubnub/ruby', 'robbievanleeuwen/section-properties', 'qiniu/php-sdk', 'n-elements/core', 'dotintent/react-native-ble-plx', 'facundoolano/google-play-scraper', 'tdiary/tdiary-core', 'ruby/psych', 'stern/stern', 'zalando/zappr', 'alexmojaki/snoop', 'geops/

In [15]:
len(commits_dict.keys())

834

In [16]:
skip_impact_1 = calc_skip_impact(data_set, repos_list_1, commits_dict)
skip_impact_2 = calc_skip_impact(data_set, repos_list_2, commits_dict)

In [17]:
optimizations["skip_workflow"]={
    "paid":{
        "adoption": skip_impact_1[0],
        "impacted_runs": skip_impact_1[1],
        "time_impact": skip_impact_1[2],
        "cost_impact": skip_impact_1[3]
    },
    "free":
    {
        "adoption": skip_impact_2[0],
        "impacted_runs": skip_impact_2[1],
        "time_impact": skip_impact_2[2],
        "cost_impact": skip_impact_2[3]
    }
}

## cache action

In [18]:
with open("pure_hashes.json") as phj:
    pure_hashes =  json.load(phj)

In [19]:
collected_commits = []
with open("collected_commits_save_poins.json") as ccs:
    collected_commits += json.load(ccs)

with open("collected_commits_save_point_part2.json") as ccsp:
    collected_commits += json.load(ccsp)

#with open("collected_commits_save_point_part3.json") as ccsp:
#    collected_commits += json.load(ccsp)

with open("collected_commits_save_point_part4.json") as ccsp:
    collected_commits += json.load(ccsp)

new_collected_commits = []
for commit in collected_commits:
    new_added = []
    new_deleted = []
    new_modified = []
    
    for c in commit["Added"]:
        if c["commit_hash"] in pure_hashes:
            new_added.append(c)
    
    for c in commit["Deleted"]:
        if c["commit_hash"] in pure_hashes:
            new_deleted.append(c)
    
    for c in commit["Modified"]:
        if c["commit_hash"] in pure_hashes:
            new_modified.append(c)
    
    if len(new_added) + len(new_deleted) +len(new_modified) != 0:
        new_collected_commits.append({
            "Added": new_added,
            "Deleted": new_deleted,
            "Modified": new_modified
        })

In [20]:
cache_usage = get_optimization_usage(collected_commits, optimization="cache@v")

### Impacted repos

In [21]:
created_with = set([x[0] for x in cache_usage["created_with_optimization"]])
added = set([x[0] for x in cache_usage["optimization_removed"]])
removed = set([x[0] for x in cache_usage["optimization_added"]])

In [22]:
adoption1 = len((created_with|
    added|
    removed) & set(list_1_names)) / len(list_1_names)

adoption2 = len((created_with|
    added|
    removed) & set(list_2_names)) / len(list_2_names)

### Prevalence and Impact

In [23]:
time_1, cost_1 = calc_cache_impact(collected_commits, data_set, repos_list_1)
time_2, cost_2 = calc_cache_impact(new_collected_commits, data_set, repos_list_2)

In [24]:
optimizations["cache"]={
    "paid":{
        "adoption": adoption1*100,
        "impacted_runs": 100.0, # by design
        "time_impact": time_1 * 100 * -1,
        "cost_impact": cost_1 * -1
    },
    "free":
    {
        "adoption": adoption2*100,
        "impacted_runs": 100.0, # by design
        "time_impact": time_2 * 100 * -1 / 2,
        "cost_impact": cost_2 * -1 / 2
    }
}

In [48]:
float(str(0.03467)[:5])*100

3.4000000000000004

## filtering target files

In [25]:
paths_ignore = get_optimization_ts(collected_commits, optimization="paths-ignore:")
paths = get_optimization_ts(collected_commits, optimization="paths:")
all_repos = data_set.get_all_repositories()
repos_1_names = all_repos[all_repos.id.isin(repos_list_1)].full_name.to_list()
repos_2_names = all_repos[all_repos.id.isin(repos_list_2)].full_name.to_list()

In [26]:
all_repos = data_set.get_all_repositories()
paths_ignore = get_optimization_ts(collected_commits, optimization="paths-ignore:")
paths = get_optimization_ts(collected_commits, optimization="paths:")
paths_ignore_usage = get_optimization_ts(collected_commits, optimization="paths-ignore:")
paths_usage = get_optimization_ts(collected_commits, optimization="paths:")
cancel_inprogress = get_optimization_ts(collected_commits, optimization="cancel-in-progress")

#
repos_1_names = all_repos[all_repos.id.isin(repos_list_1)].full_name.to_list()
repos_2_names = all_repos[all_repos.id.isin(repos_list_2)].full_name.to_list()

filter_adoption_1 = len(
    set([pi[0] for pi in paths_ignore if pi[0] in repos_1_names])|set([p[0] for p in paths if p[0] in repos_1_names])
    )/len(repos_1_names) * 100

filter_adoption_2 = len(
    set([pi[0] for pi in paths_ignore if pi[0] in repos_2_names])|set([p[0] for p in paths if p[0] in repos_2_names])
    )/len(repos_2_names) * 100

paths_ignore = [pi for pi in paths_ignore if pi[0] in repos_1_names]
paths = [pi for pi in paths if pi[0] in repos_1_names]
runs_repos = all_runs.merge(all_repos, left_on="repo_id", right_on="id")
runs_repos["start_ts"] = runs_repos.created_at.apply(lambda x: int(time.mktime(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timetuple())))
optimized_runs = []
possible_ids = []
for pi in paths_ignore:
    possible_runs = runs_repos[(runs_repos.full_name==pi[0]) & (runs_repos.workflow_file==".github/workflows/"+pi[1]) & (runs_repos.start_ts>pi[2]) & (runs_repos.start_ts<pi[3])]
    possible_runs = possible_runs.sort_values("start_ts")
    optimized_runs.extend(possible_runs[possible_runs.conclusion=="skipped"].id_x.to_list())
    possible_ids.extend(possible_runs.id_x.to_list())
for pi in paths:
    possible_runs = runs_repos[(runs_repos.full_name==pi[0]) & (runs_repos.workflow_file==".github/workflows/"+pi[1]) & (runs_repos.start_ts>pi[2]) & (runs_repos.start_ts<pi[3])]
    possible_runs = possible_runs.sort_values("start_ts")
    optimized_runs.extend(possible_runs[possible_runs.conclusion=="skipped"].id_x.to_list())
    possible_ids.extend(possible_runs.id_x.to_list())
runs_total_time = all_jobs.groupby("run_id").agg({"up_time": "sum"}).reset_index()
runs_repos = runs_repos.merge(runs_total_time, left_on="id_x", right_on="run_id")
workflow_mean = runs_repos[runs_repos.workflow_id.isin(runs_repos[runs_repos.id_x.isin(optimized_runs)].workflow_id)].groupby("workflow_id").agg({"up_time": "mean"}).reset_index()
runs_repos[runs_repos.id_x.isin(optimized_runs)].\
merge(workflow_mean, left_on="workflow_id", right_on="workflow_id").up_time_y.sum()/runs_repos[runs_repos.id_x.isin(possible_ids)].up_time.sum()*100
start_ts_min_max = runs_repos[runs_repos.id_x.isin(possible_ids)].groupby("repo_id").start_ts.agg(["min", "max"]).reset_index()
total_start_ts = 0
for i, row in start_ts_min_max.iterrows():
    total_start_ts += row["max"] - row["min"]
total_possible_time1 = total_start_ts/(12*30*24*3600)
save_cost1 = runs_repos[runs_repos.id_x.isin(possible_ids)].up_time.sum()/total_possible_time1/60 * 0.008*1.52*0.005
imp_runs_1 = len(optimized_runs)/len(possible_ids)*100
imp_time_1 = runs_repos[runs_repos.run_id.isin(optimized_runs)].up_time.sum()/(12*30*24*3600) / total_possible_time1

In [27]:
paths_ignore = get_optimization_ts(collected_commits, optimization="paths-ignore:")
paths = get_optimization_ts(collected_commits, optimization="paths:")
paths_ignore_usage = get_optimization_ts(collected_commits, optimization="paths-ignore:")
paths_usage = get_optimization_ts(collected_commits, optimization="paths:")
cancel_inprogress = get_optimization_ts(collected_commits, optimization="cancel-in-progress")

repos_1_names = all_repos[all_repos.id.isin(repos_list_1)].full_name.to_list()
repos_2_names = all_repos[all_repos.id.isin(repos_list_2)].full_name.to_list()

filter_adoption_1 = len(
    set([pi[0] for pi in paths_ignore if pi[0] in repos_1_names])|set([p[0] for p in paths if p[0] in repos_1_names])
    )/len(repos_1_names) * 100

filter_adoption_2 = len(
    set([pi[0] for pi in paths_ignore if pi[0] in repos_2_names])|set([p[0] for p in paths if p[0] in repos_2_names])
    )/len(repos_2_names) * 100

paths_ignore = [pi for pi in paths_ignore if pi[0] in repos_2_names]
paths = [pi for pi in paths if pi[0] in repos_2_names]
runs_repos = all_runs.merge(all_repos, left_on="repo_id", right_on="id")
runs_repos["start_ts"] = runs_repos.created_at.apply(lambda x: int(time.mktime(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timetuple())))
optimized_runs = []
possible_ids = []
for pi in paths_ignore:
    possible_runs = runs_repos[(runs_repos.full_name==pi[0]) & (runs_repos.workflow_file==".github/workflows/"+pi[1]) & (runs_repos.start_ts>pi[2]) & (runs_repos.start_ts<pi[3])]
    possible_runs = possible_runs.sort_values("start_ts")
    optimized_runs.extend(possible_runs[possible_runs.conclusion=="skipped"].id_x.to_list())
    possible_ids.extend(possible_runs.id_x.to_list())
for pi in paths:
    possible_runs = runs_repos[(runs_repos.full_name==pi[0]) & (runs_repos.workflow_file==".github/workflows/"+pi[1]) & (runs_repos.start_ts>pi[2]) & (runs_repos.start_ts<pi[3])]
    possible_runs = possible_runs.sort_values("start_ts")
    optimized_runs.extend(possible_runs[possible_runs.conclusion=="skipped"].id_x.to_list())
    possible_ids.extend(possible_runs.id_x.to_list())
runs_total_time = all_jobs.groupby("run_id").agg({"up_time": "sum"}).reset_index()
runs_repos = runs_repos.merge(runs_total_time, left_on="id_x", right_on="run_id")
workflow_mean = runs_repos[runs_repos.workflow_id.isin(runs_repos[runs_repos.id_x.isin(optimized_runs)].workflow_id)].groupby("workflow_id").agg({"up_time": "mean"}).reset_index()
runs_repos[runs_repos.id_x.isin(optimized_runs)].\
merge(workflow_mean, left_on="workflow_id", right_on="workflow_id").up_time_y.sum()/runs_repos[runs_repos.id_x.isin(possible_ids)].up_time.sum()*100
start_ts_min_max = runs_repos[runs_repos.id_x.isin(possible_ids)].groupby("repo_id").start_ts.agg(["min", "max"]).reset_index()
total_start_ts = 0
for i, row in start_ts_min_max.iterrows():
    total_start_ts += row["max"] - row["min"]
total_possible_time2 = total_start_ts/(12*30*24*3600)
save_cost2 = runs_repos[runs_repos.id_x.isin(possible_ids)].up_time.sum()/total_possible_time2/60 * 0.008*1.52*0.005
imp_runs_2 = len(optimized_runs)/len(possible_ids)*100
imp_time_2 = runs_repos[runs_repos.run_id.isin(optimized_runs)].up_time.sum()/(12*30*24*3600) / total_possible_time2

In [28]:
optimizations["filtering_target_files"]={
    "paid":{
        "adoption": filter_adoption_1,
        "impacted_runs": imp_runs_1,
        "time_impact": imp_time_1,
        "cost_impact": save_cost1
    },
    "free":
    {
        "adoption": filter_adoption_2,
        "impacted_runs": imp_runs_2,
        "time_impact": imp_time_1,
        "cost_impact": save_cost2
    }
}

## fail fast option

In [29]:
fail_fast = get_optimization_usage(collected_commits, optimization="fail-fast:false")
repos_1_names = all_repos[all_repos.id.isin(repos_list_1)].full_name.to_list()
len(set([x[0] for x in fail_fast["optimization_removed"] if x[0] in repos_1_names]))

16

In [30]:
len_off1 = len(set([x[0] for x in fail_fast["created_with_optimization"] if x[0] in list_1_names])-set([x[0] for x in fail_fast["optimization_removed"] if x[0] in list_1_names]))
len_off2 = len(set([x[0] for x in fail_fast["created_with_optimization"] if x[0] in list_2_names])-set([x[0] for x in fail_fast["optimization_removed"] if x[0] in list_2_names]))

In [31]:
adoption_1 = 100 - len_off1/len(list_1_names)*100
adoption_2 = 100 - len_off2/len(list_2_names)*100

In [32]:
fail_fast_ts = get_optimization_ts(collected_commits, optimization="fail-fast:false")

In [33]:
all_runs = data_set.get_all_runs()
all_jobs = data_set.get_all_jobs()
all_repos = data_set.get_all_repositories()
runs_total_time = all_jobs.groupby("run_id").agg({"up_time": "sum"}).reset_index()
all_runs = all_runs.merge(runs_total_time, left_on="id", right_on="run_id")
all_runs["start_ts"] = all_runs.created_at.apply(lambda x: int(time.mktime(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timetuple())))
runs_repos = all_runs.merge(all_repos[["id", "full_name"]], left_on="repo_id", right_on="id")

In [34]:
fail_fast_false = []
for pi in fail_fast_ts:
    if pi[0] in list_2_names:
        possible_runs = runs_repos[(runs_repos.full_name==pi[0]) & (runs_repos.workflow_file==".github/workflows/"+pi[1]) & (runs_repos.start_ts>pi[2]) & (runs_repos.start_ts<pi[3])]
        possible_runs = possible_runs.sort_values("start_ts")
        fail_fast_false.extend(possible_runs.id_x.to_list())
        #optimized_runs.extend(possible_runs[possible_runs.conclusion=="skipped"].id_x.to_list())
        #possible_ids.extend(possible_runs.id_x.to_list())
no_fail_fast_runs = all_runs[~all_runs.id.isin(fail_fast_false)]
jobs_matrix = all_jobs[((all_jobs.name.str.contains("\(")) | (all_jobs.name.str.contains("matrix")))]
optimized_runs = no_fail_fast_runs[(no_fail_fast_runs.id.isin(jobs_matrix.run_id)) & (no_fail_fast_runs.conclusion=="failure")]
no_fail_fast_success = no_fail_fast_runs[no_fail_fast_runs.conclusion=="success"]
saved_time = []

for i, row in optimized_runs.iterrows():
    start_ts = row["start_ts"]
    success_df = no_fail_fast_success[(no_fail_fast_success.workflow_id == row["workflow_id"])]
    if success_df.shape[0] != 0:
        success_time = success_df[(success_df.workflow_id == row["workflow_id"])].up_time.to_list()[0]
        saved_time.append(success_time - row["up_time"])
saved_time2 = sum(saved_time)/(no_fail_fast_runs.up_time.sum()+sum(saved_time))*100
impacted_runs2 = optimized_runs.shape[0] / all_runs.shape[0]
sum_start_ts = 0
for i, row in no_fail_fast_runs.groupby("repo_id").start_ts.agg(["min", "max"]).reset_index().iterrows():
    sum_start_ts += row["max"] - row["min"]
sum_start_ts = sum_start_ts/(12*30*24*3600)
save_cost2 = sum(saved_time)/sum_start_ts * 0.008 * 1.52 / 60

In [35]:
fail_fast_false = []
for pi in fail_fast_ts:
    if pi[0] in list_1_names:
        possible_runs = runs_repos[(runs_repos.full_name==pi[0]) & (runs_repos.workflow_file==".github/workflows/"+pi[1]) & (runs_repos.start_ts>pi[2]) & (runs_repos.start_ts<pi[3])]
        possible_runs = possible_runs.sort_values("start_ts")
        fail_fast_false.extend(possible_runs.id_x.to_list())
        #optimized_runs.extend(possible_runs[possible_runs.conclusion=="skipped"].id_x.to_list())
        #possible_ids.extend(possible_runs.id_x.to_list())
no_fail_fast_runs = all_runs[~all_runs.id.isin(fail_fast_false)]
jobs_matrix = all_jobs[((all_jobs.name.str.contains("\(")) | (all_jobs.name.str.contains("matrix")))]
optimized_runs = no_fail_fast_runs[(no_fail_fast_runs.id.isin(jobs_matrix.run_id)) & (no_fail_fast_runs.conclusion=="failure")]
no_fail_fast_success = no_fail_fast_runs[no_fail_fast_runs.conclusion=="success"]
saved_time = []

for i, row in optimized_runs.iterrows():
    start_ts = row["start_ts"]
    success_df = no_fail_fast_success[(no_fail_fast_success.workflow_id == row["workflow_id"])]
    if success_df.shape[0] != 0:
        success_time = success_df[(success_df.workflow_id == row["workflow_id"])].up_time.to_list()[0]
        saved_time.append(success_time - row["up_time"])
saved_time1 = sum(saved_time)/(no_fail_fast_runs.up_time.sum()+sum(saved_time))*100
impacted_runs1 = optimized_runs.shape[0] / all_runs.shape[0]
sum_start_ts = 0
for i, row in no_fail_fast_runs.groupby("repo_id").start_ts.agg(["min", "max"]).reset_index().iterrows():
    sum_start_ts += row["max"] - row["min"]
sum_start_ts = sum_start_ts/(12*30*24*3600)
save_cost1 = sum(saved_time)/sum_start_ts * 0.008 * 1.52 / 60

In [36]:
optimizations["fail_fast"]={
    "paid":{
        "adoption": adoption_1,
        "impacted_runs": impacted_runs1*100,
        "time_impact": saved_time1,
        "cost_impact": save_cost1
    },
    "free":
    {
        "adoption": adoption_2,
        "impacted_runs": impacted_runs2*100,
        "time_impact": saved_time2,
        "cost_impact": save_cost2
    }
}

## Vm minutes

In [37]:
vm_minutes = get_optimization_usage_avm(collected_commits, optimization="timeout-minutes")
timeout_ = get_optimization_ts_avm(collected_commits, optimization="timeout-minutes:")

In [38]:
all_repos = data_set.get_all_repositories()
all_runs = data_set.get_all_runs()
all_runs = all_runs[all_runs.repo_id.isin(repos_list_2)]
all_jobs = data_set.get_all_jobs()

all_runs["start_ts"] = all_runs.created_at.apply(lambda x: int(time.mktime(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timetuple())))
runs_repos = all_runs.merge(all_repos, left_on="repo_id", right_on="id")

optimized_runs_ids = []
saved_time = []
jobs_ids = []
all_possible = []
for c in timeout_:
    candidate_runs = runs_repos[(runs_repos.full_name==c[0]) & 
            (runs_repos.workflow_file==".github/workflows/"+c[1]) & 
            (runs_repos.start_ts > c[2])& 
            (runs_repos.start_ts < c[3])]
    timeout_value = c[4]
    timed_out_jobs = all_jobs[(all_jobs.run_id.isin(candidate_runs.id_x)) & (all_jobs.up_time > timeout_value*60-59) & (all_jobs.up_time<timeout_value*60+59)]
    max_job_time = all_jobs[all_jobs.run_id.isin(runs_repos[(runs_repos.full_name==c[0])&(runs_repos.workflow_file==".github/workflows/"+c[1])].id_x.to_list())].up_time.max()
    average_high = all_jobs[(all_jobs.run_id.isin(runs_repos[(runs_repos.full_name==c[0])&
                                                            (runs_repos.workflow_file==".github/workflows/"+c[1])].id_x.to_list()))&
                           (all_jobs.up_time>timeout_value*60+30)].up_time.mean()
    probability_high = all_jobs[(all_jobs.run_id.isin(runs_repos[(runs_repos.full_name==c[0])&
                                                            (runs_repos.workflow_file==".github/workflows/"+c[1])].id_x.to_list()))&
                           (all_jobs.up_time>timeout_value*60+30)].shape[0]/(all_jobs[(all_jobs.run_id.isin(runs_repos[(runs_repos.full_name==c[0])&
                                                            (runs_repos.workflow_file==".github/workflows/"+c[1])].id_x.to_list()))].shape[0]+0.1)
    all_possible.extend(candidate_runs.id_x.to_list())
    if not np.isnan(average_high):
        for i, row in timed_out_jobs.iterrows():
            saved_time.append((average_high - timeout_value*60))
            jobs_ids.append(row["id"])

In [39]:
impact_vm_time2 = sum(saved_time) / (all_jobs[all_jobs.run_id.isin(all_possible)].up_time.sum()+sum(saved_time)) *100
start_ts_min_max = runs_repos[runs_repos.id_x.isin(all_possible)].groupby("repo_id").start_ts.agg(["min", "max"])
total_start_ts = 0
for i, row in start_ts_min_max.iterrows():
    total_start_ts += row["max"] - row["min"]
impact_cost2 = sum(saved_time) / (total_start_ts/(12*30*24*3600)) * 1.52 * 0.008 /60
impact_runs2 =len(all_possible)/runs_repos.shape[0]*100
#cache_usage = get_optimization_usage(collected_commits, "concurrency:")
created_with = set([(x[0],x[1]) for x in cache_usage["created_with_optimization"]])
added = set([(x[0],x[1]) for x in cache_usage["optimization_removed"]])
removed = set([(x[0],x[1]) for x in cache_usage["optimization_added"]])
len((created_with|
    added|
    removed))
#cache_usage = get_optimization_usage(collected_commits, "concurrency:")
created_with = set([x[0] for x in vm_minutes["created_with_optimization"]])
added = set([x[0] for x in vm_minutes["optimization_removed"]])
removed = set([x[0] for x in vm_minutes["optimization_added"]])
adoption2 = len((created_with|
    added|
    removed) & set(list_2_names)) / len(list_2_names) * 100

In [40]:
all_repos = data_set.get_all_repositories()
all_runs = data_set.get_all_runs()
all_runs = all_runs[all_runs.repo_id.isin(repos_list_1)]
all_jobs = data_set.get_all_jobs()

all_runs["start_ts"] = all_runs.created_at.apply(lambda x: int(time.mktime(datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ").timetuple())))
runs_repos = all_runs.merge(all_repos, left_on="repo_id", right_on="id")

optimized_runs_ids = []
saved_time = []
jobs_ids = []
all_possible = []
for c in timeout_:
    candidate_runs = runs_repos[(runs_repos.full_name==c[0]) & 
            (runs_repos.workflow_file==".github/workflows/"+c[1]) & 
            (runs_repos.start_ts > c[2])& 
            (runs_repos.start_ts < c[3])]
    timeout_value = c[4]
    timed_out_jobs = all_jobs[(all_jobs.run_id.isin(candidate_runs.id_x)) & (all_jobs.up_time > timeout_value*60-59) & (all_jobs.up_time<timeout_value*60+59)]
    max_job_time = all_jobs[all_jobs.run_id.isin(runs_repos[(runs_repos.full_name==c[0])&(runs_repos.workflow_file==".github/workflows/"+c[1])].id_x.to_list())].up_time.max()
    average_high = all_jobs[(all_jobs.run_id.isin(runs_repos[(runs_repos.full_name==c[0])&
                                                            (runs_repos.workflow_file==".github/workflows/"+c[1])].id_x.to_list()))&
                           (all_jobs.up_time>timeout_value*60+30)].up_time.mean()
    probability_high = all_jobs[(all_jobs.run_id.isin(runs_repos[(runs_repos.full_name==c[0])&
                                                            (runs_repos.workflow_file==".github/workflows/"+c[1])].id_x.to_list()))&
                           (all_jobs.up_time>timeout_value*60+30)].shape[0]/(all_jobs[(all_jobs.run_id.isin(runs_repos[(runs_repos.full_name==c[0])&
                                                            (runs_repos.workflow_file==".github/workflows/"+c[1])].id_x.to_list()))].shape[0]+0.1)
    all_possible.extend(candidate_runs.id_x.to_list())
    if not np.isnan(average_high):
        for i, row in timed_out_jobs.iterrows():
            saved_time.append((average_high - timeout_value*60))
            jobs_ids.append(row["id"])

In [41]:
impact_vm_time1 = sum(saved_time) / (all_jobs[all_jobs.run_id.isin(all_possible)].up_time.sum()+sum(saved_time)) *100
start_ts_min_max = runs_repos[runs_repos.id_x.isin(all_possible)].groupby("repo_id").start_ts.agg(["min", "max"])
total_start_ts = 0
for i, row in start_ts_min_max.iterrows():
    total_start_ts += row["max"] - row["min"]
impact_cost1 = sum(saved_time) / (total_start_ts/(12*30*24*3600)) * 1.52 * 0.008 /60
impact_runs1 =len(all_possible)/runs_repos.shape[0]*100
#cache_usage = get_optimization_usage(collected_commits, "concurrency:")
created_with = set([(x[0],x[1]) for x in cache_usage["created_with_optimization"]])
added = set([(x[0],x[1]) for x in cache_usage["optimization_removed"]])
removed = set([(x[0],x[1]) for x in cache_usage["optimization_added"]])
len((created_with|
    added|
    removed))
#cache_usage = get_optimization_usage(collected_commits, "concurrency:")
created_with = set([x[0] for x in vm_minutes["created_with_optimization"]])
added = set([x[0] for x in vm_minutes["optimization_removed"]])
removed = set([x[0] for x in vm_minutes["optimization_added"]])
adoption1 = len((created_with|
    added|
    removed) & set(list_1_names)) / len(list_1_names) * 100

In [42]:
optimizations["custom_timeout"]={
    "paid":{
        "adoption": adoption1,
        "impacted_runs": impacted_runs1*100,
        "time_impact": impact_vm_time1,
        "cost_impact": impact_cost1
    },
    "free":
    {
        "adoption": adoption2,
        "impacted_runs": impacted_runs2*100,
        "time_impact": impact_vm_time2,
        "cost_impact": impact_cost2
    }
}

In [43]:
print("{:<30} {:<24} {:<24} {:<24} {:<24}".format(
        "",
        "Adoption rate %",
        "Impacted runs %",
        "Impact on VM-time %",
        "Annual cost delta $"
         ))
print(" "*30 + "-"*96)
print("{:<30} {:<12} {:<12} {:<12} {:<12} {:<12} {:<12} {:<12} {:<12}".format(
        "Conclusion",
        "Paid",
        "Free",
        "Paid",
        "Free",
        "Paid",
        "Free",
        "Paid",
        "Free"
         ))
print("-"*126)
for op_name in ["cache", "fail_fast", "cancel_in_progress", "skip_workflow", "filtering_target_files", "custom_timeout"]:
    o = optimizations[op_name]
    print("{:<30} {:<12} {:<12} {:<12} {:<12} -{:<12} -{:<12} -{:<12} -{:<12}".format(
        op_name,
        round(o["paid"]["adoption"], 1),
        round(o["free"]["adoption"], 1),
        round(o["paid"]["impacted_runs"], 1),
        round(o["free"]["impacted_runs"], 1),
        round(o["paid"]["time_impact"], 1),
        round(o["free"]["time_impact"], 1),
        round(o["paid"]["cost_impact"], 2),
        round(o["free"]["cost_impact"], 2),
         ))


                               Adoption rate %          Impacted runs %          Impact on VM-time %      Annual cost delta $     
                              ------------------------------------------------------------------------------------------------
Conclusion                     Paid         Free         Paid         Free         Paid         Free         Paid         Free        
------------------------------------------------------------------------------------------------------------------------------
cache                          32.9         17.8         100.0        100.0        -3.5          -6.2          -22.45        -0.61        
fail_fast                      75.9         84.5         3.1          4.7          -1.5          -2.0          -2.13         -4.22        
cancel_in_progress             10.1         1.9          9.2          1.7          -4.1          -1.6          -62.63        -0.52        
skip_workflow                  9.5          4.6          0.1   