# Extracts trace data from Elasticsearch and saves it in HDF5 files


In [1]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
import pandas as pd
import numpy as np


### select sites and time periods


In [2]:
start_date = '2018-08-01 00:00:00'
end_date = '2018-09-01 00:00:00'
site = 'MWT2'

print("start:", start_date, "end:", end_date)
start = int(pd.Timestamp(start_date).timestamp())
end = int(pd.Timestamp(end_date).timestamp())


start: 2018-08-01 00:00:00 end: 2018-09-01 00:00:00


### select kind of traces to export and name your dataset


In [3]:
dataset = 'prod_AUG'
trace_query = {
    "_source": ["time_start", "time_end", "site", "event", "scope", "filename", "filesize", "pandaID"],
    'query': {
        'bool': {
            'must': [
                {'range': {'time_start': {'gte': start, 'lt': end}}},
                {'exists': {"field": "filename"}},
                {'exists': {"field": "pandaID"}},
                {'wildcard': {'site': site + '*'}},
                # {'wildcard': {'filename': 'EVNT*'}},
                #                 {'wildcard': {'event': 'get_sm*'}},
                {'term': {'event': 'get_sm'}}
                # {'term': {'event': 'get_sm_a'}},
                # {'term': {'event': 'download'}},
            ]
        }
    }
}


es = Elasticsearch(['atlas-kibana.mwt2.org:9200'], timeout=60)


### Does scan


In [4]:
scroll = scan(client=es, index="traces", query=trace_query)
count = 0
requests = []
for res in scroll:
    r = res['_source']
    requests.append([r['scope'] + ':' + r['filename'], r['filesize'], r['time_start'], r['pandaID']])

    if not count % 100000:
        print(count)
#     if count>300:
#         break
    count = count + 1

all_accesses = pd.DataFrame(requests).sort_values(3)
all_accesses.columns = ['filename', 'filesize', 'transfer_start', 'pandaid']

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000


### finds wall time, cores for all jobs.


In [5]:
pids = all_accesses.pandaid.unique()
minpid = all_accesses.pandaid.min()
maxpid = all_accesses.pandaid.max()
print("Unique PandaIDs", len(pids), "min", minpid, "max", maxpid)


Unique PandaIDs 457593 min 4009106692 max 4043589941


In [6]:
job_query = {
    "_source": ["actualcorecount", "wall_time", "pandaid"],
    'query': {
        'bool': {
            'must': [
                {'range': {'pandaid': {'gte': int(minpid), 'lt': int(maxpid)}}},
                {'wildcard': {'computingsite': site + '*'}}
            ]
        }
    }
}

scroll = scan(client=es, index="jobs", query=job_query)
count = 0
requests = []
for res in scroll:
    r = res['_source']
    requests.append([r['pandaid'], r['wall_time'], r['actualcorecount']])

    if not count % 100000:
        print(count)
#     if count>300:
#         break
    count = count + 1


0
100000
200000
300000
400000
500000
600000
700000
800000


In [7]:
all_jobs = pd.DataFrame(requests).sort_values(0)
all_jobs.columns = ['pandaid', 'wall_time', 'cores']
all_accesses.set_index('pandaid', drop=True, inplace=True)
all_jobs.set_index('pandaid', drop=True, inplace=True)

all=all_accesses.join(all_jobs,how='inner')
all.describe()

Unnamed: 0,filesize,transfer_start,wall_time,cores
count,4125484.0,4125484.0,4125484.0,4125091.0
mean,1908431000.0,1534565000.0,16189.03,5.924798
std,1668771000.0,821933.5,13901.29,3.196866
min,9723.0,1533082000.0,-5849.0,1.0
25%,239860100.0,1533845000.0,4407.0,1.0
50%,2024002000.0,1534643000.0,18439.0,8.0
75%,3156628000.0,1535348000.0,22467.0,8.0
max,12014360000.0,1535760000.0,446837.0,8.0


In [8]:
all.set_index('filename', drop=True, inplace=True)
all.to_hdf(site + '_' + dataset + '.h5', key=site, mode='w', complevel=1)

(4125484, 5)
Done.


In [9]:
print('traces:',all_accesses.shape[0])
print('jobs:',all_jobs.shape[0])
print('merged:',all.shape[0])
print('Done.')

traces: 4137233
jobs: 810667
merged: 4125484
Done.
