# Helper to load cleaned workflow jobs data

Output:
* `jobs`: All rows in the jobs data
* `e2e_jobs`: Only test jobs
* `e2e_jobs_no_steps`: Only one row per test job, deduplicated across steps.

In [None]:
# Load and parse the raw jobs data into the `jobs` DataFrame.
# NOTE(review): `pd` is not imported in this helper; presumably the notebook
# that runs it has already done `import pandas as pd` -- confirm.
# The file is opened explicitly as UTF-8 (the standard JSON encoding) so that
# decoding does not depend on the platform's default locale encoding.
with open('../data/cleaned/jobs.json', encoding='utf-8') as f:
    jobs = pd.read_json(
        f,
        # Columns that pandas should parse as dates rather than leave as-is.
        convert_dates=[
            'job_started_at',
            'job_completed_at',
            'step_started_at',
            'step_completed_at',
            'job_started_date',
        ],
    )
# Report the size and the resulting column dtypes as a quick sanity check.
print('Loaded jobs with %d rows. Columns:' % (jobs.shape[0],))
print(jobs.dtypes)

In [None]:
# Keep only the test jobs, i.e. the rows whose 'e2e-composite' value is set
# (not null); every other row of `jobs` is left out of `e2e_jobs`.
e2e_jobs = jobs.loc[jobs['e2e-composite'].notna()]
print('Filtered down to test jobs with %d rows.' % len(e2e_jobs))

In [None]:
# A job appears once per step in `e2e_jobs`. Project onto the job-level
# columns listed below (leaving the step columns out) and collapse rows that
# are identical across those columns, so repeated step rows for the same job
# are dropped.
e2e_jobs_no_steps = e2e_jobs.loc[
    :,
    [
        'workflow_id',
        'workflow_run_attempt',
        'job_status',
        'job_conclusion',
        'job_started_at',
        'job_completed_at',
        'job_name',
        'job_started_date',
        'job_duration_minutes',
        'workflow_attempt_uid',
        'e2e-composite',
        'e2e-flavor',
    ],
].drop_duplicates()

# Report the size and the resulting column dtypes as a quick sanity check.
print('Dropped down to one row per job with %d rows. Columns:' % len(e2e_jobs_no_steps))
print(e2e_jobs_no_steps.dtypes)