# SpecMetrics - Processing local files

In [24]:
import glob
import re
import numpy as np
import pandas as pd
import datetime
import json
from tqdm import tqdm

def filename(path):
    """Return the part of ``path`` after its last "/" (the whole string if none)."""
    _head, _sep, tail = path.rpartition("/")
    return tail

def file_info(path):
    """Parse the run metadata encoded in a metrics file name.

    The base name is expected to contain
    ``<17 digits>-<branch>-<8 word characters>.json``; the 17 digits are
    parsed with the ``%Y%m%d%H%M%S%f`` format.

    Args:
        path: "/"-separated path (or bare name) of the JSON file.

    Returns:
        A ``(date, branch, sha)`` tuple where ``date`` is a
        ``datetime.datetime`` and the other two items are strings.

    Raises:
        ValueError: if the file name does not follow the expected pattern,
            or if the digits do not form a valid timestamp.
    """
    # Only the last path component carries the metadata.
    file = path.rsplit("/", 1)[-1]
    # Raw string: "\d", "\w" and "\." are regex escapes, not string escapes.
    match = re.search(r"(\d{17})-(.+)-(\w{8})\.json", file)
    if match is None:
        raise ValueError(
            "unexpected metrics file name %r; expected "
            "<17-digit timestamp>-<branch>-<8-char sha>.json" % file
        )
    stamp, branch, sha = match.groups()
    date = datetime.datetime.strptime(stamp, "%Y%m%d%H%M%S%f")
    return (date, branch, sha)

def flatten_run(run_id, data, date=None, git_repo=None, git_sha=None, git_branch=None):
    """Flatten one run's nested metrics dict into a single-level record.

    Args:
        run_id: identifier stored under ``"run_id"``.
        data: parsed run dict; must provide ``"around"``, ``"system"`` and the
            top-level counters read below. ``"git"`` is only consulted for
            the git fields that are not supplied (or are falsy) as arguments.
        date: value stored as-is under ``"date"``.
        git_repo: overrides ``data["git"]["repo"]`` / ``["remote_origin"]``.
        git_sha: overrides ``data["git"]["sha"]``.
        git_branch: overrides ``data["git"]["branch"]``.

    Returns:
        A flat dict describing the run.
    """
    row = {"run_id": run_id}

    for metric in ("queries_count", "queries_duration",
                   "requests_count", "requests_duration"):
        row["around_" + metric] = data["around"][metric]

    for key in ("duration", "example_count", "failure_count",
                "pending_count", "seed", "seed_used"):
        row[key] = data[key]

    row["system_hostname"] = data["system"]["hostname"]

    # Explicit arguments win; data["git"] is only touched when needed.
    if not git_repo:
        git_repo = data["git"]["repo"] or data["git"]["remote_origin"]
    row["git_repo"] = git_repo
    row["git_sha"] = git_sha if git_sha else data["git"]["sha"]
    row["git_branch"] = git_branch if git_branch else data["git"]["branch"]

    row["date"] = date
    return row

def build_dfs(dir, git_repo=None):
    """Load every ``*.json`` file directly inside ``dir`` into two DataFrames.

    Args:
        dir: directory to scan (non-recursive glob on ``dir + "/*.json"``).
        git_repo: forwarded to ``flatten_run`` as the repository override.

    Returns:
        ``(examples_df, runs_df)`` -- note the order: examples first.
    """
    run_rows = []
    example_rows = []

    for json_path in tqdm(glob.glob(dir + "/*.json")):
        # The run id is the file's base name without the ".json" text.
        run_id = json_path.split("/")[-1].replace(".json", "")
        date, _branch, _sha = file_info(json_path)
        payload = load_data(json_path)

        run_rows.append(flatten_run(run_id, payload, date=date, git_repo=git_repo))
        example_rows.extend(flatten_examples(payload, run_id=run_id))

    return (pd.DataFrame.from_dict(example_rows), pd.DataFrame.from_dict(run_rows))

def load_data(file):
    """Read and parse the JSON document stored at ``file``.

    Args:
        file: path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default text encoding, which varies between machines.
    with open(file, encoding="utf-8") as data_file:
        return json.load(data_file)
    
def flatten_example(example, run_id=None):
    """Flatten one example dict into a single-level record.

    The example's ``file_path`` is split on "/" and its first two components
    are dropped (the original comment identifies them as "." and "spec").
    The remaining components feed ``dir_0`` .. ``dir_3`` and ``file_name``.

    Args:
        example: dict with ``file_path``, ``description``, ``line_number``,
            ``execution_result`` (``run_time``, ``status``) and the four
            query/request metrics.
        run_id: identifier stored under ``"run_id"``.

    Returns:
        A flat dict describing the example.
    """
    segments = example["file_path"].split("/")[2:]
    depth = len(segments)

    row = {
        "run_id": run_id,
        "description": example["description"],
        # First remaining component is always taken, whatever the depth.
        "dir_0": segments[0],
    }

    # Deeper levels are filled only when something follows them in the
    # path, i.e. when that component is not the last one.
    for key, level in (("dir_1", 1), ("dir_2", 2), ("dir_3", 3)):
        row[key] = segments[level] if depth > level + 1 else None

    row["file_name"] = segments[-1]
    row["line_number"] = example["line_number"]

    for field in ("run_time", "status"):
        row[field] = example["execution_result"][field]

    for field in ("queries_count", "queries_duration",
                  "requests_count", "requests_duration"):
        row[field] = example[field]

    return row

def flatten_examples(data, run_id=None):
    """Return one flattened record per entry of ``data["examples"]``.

    Each record is produced by ``flatten_example`` and tagged with ``run_id``.
    """
    records = []
    for item in data["examples"]:
        records.append(flatten_example(item, run_id=run_id))
    return records

def all_paths(data):
    """Return the set of distinct ``file_path`` values in ``data["examples"]``."""
    return {item["file_path"] for item in data["examples"]}

def dirs_at_level(data, level):
    """Return the distinct path components found at index ``level``.

    Every example's ``file_path`` is split on "/" and the component at
    position ``level`` is collected; duplicates collapse because the result
    is a set.
    """
    return {item["file_path"].split("/")[level] for item in data["examples"]}

def build_examples_df(files):
    """Build a DataFrame holding the flattened examples of all ``files``.

    Args:
        files: iterable of JSON file paths; each file's base name without
            the ".json" text is used as the ``run_id`` of its examples.

    Returns:
        A ``pandas.DataFrame`` with one row per example.
    """
    rows = []
    for json_path in tqdm(files):
        payload = load_data(json_path)
        base_name = json_path.split("/")[-1]
        rows.extend(flatten_examples(payload, run_id=base_name.replace(".json", "")))
    return pd.DataFrame.from_dict(rows)

In [25]:
# Directory holding the per-run JSON metric files to load.
DATA_DIR = "/Volumes/GiveMeMore/spec-metrics/jobteaser-jobteaser"

# Load every run once; the repository name is forced for all runs
# instead of being read from each file's "git" section.
examples_df, runs_df = build_dfs(DATA_DIR, git_repo="jobteaser/jobteaser")

100%|██████████| 873/873 [00:36<00:00, 24.22it/s]


In [40]:
# Index the runs by their timestamp.
runs_df.index = runs_df.date
# Last "develop" run with more than 5000 examples. `.ix` has been removed
# from pandas; `.iloc[-1]` is the positional lookup it performed here.
# NOTE(review): "last" follows the frame's row order, which comes from the
# glob listing -- sort by date first if the most recent run is intended.
(runs_df[(runs_df.git_branch == "develop") & (runs_df.example_count > 5000)]).iloc[-1]

around_queries_count                                           44098
around_queries_duration                                      21.6056
around_requests_count                                              0
around_requests_duration                                           0
date                                      2016-08-26 14:39:53.637000
duration                                                     2940.36
example_count                                                   5615
failure_count                                                      3
git_branch                                                   develop
git_repo                                         jobteaser/jobteaser
git_sha                     edeb558cea7c606629e496b407d9907df8ca9aae
pending_count                                                     16
run_id                            20160826143953637-develop-edeb558c
seed                                                           56585
seed_used                         

In [39]:
# Work on a copy of the examples of a single run.
df = examples_df[examples_df.run_id == "20160826143953637-develop-edeb558c"].copy()
# Totals per top-level spec directory. `numeric_only=True` keeps the sum
# restricted to numeric columns; without it, current pandas no longer drops
# the string columns (description, status, ...) silently.
df.groupby("dir_0").sum(numeric_only=True)

Unnamed: 0_level_0,line_number,queries_count,queries_duration,requests_count,requests_duration,run_time
dir_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
acceptance,3489,38051,28.691973,460,67.713842,452.380154
chewy,20,0,0.0,0,0.0,0.013094
config,3584,534,0.242728,142,6.647505,11.404211
controllers,308946,131725,52.980923,1145,7.903778,461.36423
decorators,19536,11217,2.601925,0,0.0,41.322151
domain,1678,8767,4.134288,2,0.000737,37.470423
helpers,7778,1072,0.389056,0,0.0,3.994248
infrastructure,12848,2639,2.600759,0,0.0,13.162172
inputs,286,13,0.004575,0,0.0,0.141603
integration,4309,3977,1.689828,24,0.005697,14.024816
