In [217]:
import sys
from collections import defaultdict
import pprint as pp
import numpy as np
import pandas as pd
import scipy.optimize as optimize

In [220]:
df = pd.read_csv('tasks.csv', dtype={'cpu_n': 'float'}, names=['jeditaskid', 'processingtype', 'transformation', 'atlrel', 'site',
                                                               'njobs', 'cpuevt_avg', 'cpuevt_rms', 'wallevt_avg',
                                                               'wallevt_rms', 'cpu', 'wc', 'cores', 'jevts', 'eff'],
                nrows = 500000)

In [221]:
print len(df)

500000


In [222]:
smap = dict()   # Maps site name to a numerical index
i = 0
for s in set(df['site']):
    smap[s] = i
    i += 1

nsites = i   # Total number of sites
k_ini = np.ones(i)   # Initialises CPU factors for all sites to one

In [223]:
grouped = df.groupby(['jeditaskid'], as_index=False)
#data = np.zeros((len(grouped), nsites))
data2 = list()   # A list with each item corresponding to a task and containing a list of (site, value) tuples,
                 # where site is the numerical index for the site and value is the CPU time per event on that site

In [224]:
i = 0
tmap = list()   # Maps the item number in data2 to the corresponding task id
for j, g in grouped:
    tmap.append(j)
    tmp = list()
    for row in g.itertuples():
        site = row[5]
        n = smap[site]
        x = row[7]
#        data[i][n] = x
        tmp.append((n, x))
    data2.append(tmp)
    i += 1

In [225]:
data2[726]

[(110, 1017.7960879368657)]

In [226]:
task = tmap[726]   # Anomalous task where some jobs use zero CPU time per event
site = data2[726][0][0]
value = data2[726][0][1]
name = 'X'
for s in smap.keys():
    if smap[s] == site:
        name = s
print task
print name
print value
df[df['jeditaskid'] == 5433371]

4638481
BOINC
1017.79608794


Unnamed: 0,jeditaskid,processingtype,transformation,atlrel,site,njobs,cpuevt_avg,cpuevt_rms,wallevt_avg,wallevt_rms,cpu,wc,cores,jevts,eff
517,5433371,simul,Sim_tf.py,Atlas-19.2.3,BNL_PROD_MCORE,160,1088.539938,122.8858,1224.5665,179.522585,17416639,19593064,8,16000,0.897814
2638,5433371,simul,Sim_tf.py,Atlas-19.2.3,TRIUMF_MCORE,11,0.0,0.0,1337.890909,295.196794,0,1471680,8,1100,0.0
11687,5433371,simul,Sim_tf.py,Atlas-19.2.3,FZK-LCG2_MCORE,13,1033.809231,80.894335,1225.643077,125.829699,1343952,1593336,8,1300,0.846963
36980,5433371,simul,Sim_tf.py,Atlas-19.2.3,INFN-T1_MCORE,16,948.618125,143.895653,1022.115,155.677089,1517789,1635384,8,1600,0.92828


In [227]:
def func(k):
    total = 0.
    c = 0
    for task in data2:
        c += 1
        if len(task) == 1: continue
        sites = [d[0] for d in task]
        values = [d[1] for d in task]
        for i in range(len(sites)):
            j = i + 1
            if j == len(sites): j = 0
            ratio = values[i] / values[j]
            if np.isinf(ratio) | np.isnan(ratio): continue
            delta = (ratio - k[sites[i]] / k[sites[j]])**2
            total = total + delta
    return total

In [228]:
func(k_ini)

1089871.2059414822

In [229]:
ref = smap['CERN-PROD']
cons = ({'type': 'eq', 'fun': lambda x: x[ref] - 1})

In [230]:
optimize.minimize(func, k_ini, constraints=cons)

  status: 9
 success: False
    njev: 101
    nfev: 32246
     fun: 1029015.2866001579
       x: array([ 0.99010393,  0.77798174,  1.57517015,  0.73617417,  0.66829716,
        1.62458858,  0.91367799,  1.39378476,  1.15150627,  0.83418174,
        2.84076998,  0.72112314,  1.52796301,  0.76258116,  0.7983303 ,
        1.03959456,  0.69494368,  0.46794765,  1.82401368,  1.05424914,
        1.03570228,  1.14886343,  1.40275536,  1.74535718,  1.09477043,
        1.21841097,  3.13094141,  0.65185021,  1.0553996 ,  0.83399162,
        1.03350385,  0.65541412,  1.35800391,  1.08193145,  0.62580977,
        0.95698212,  0.99246571,  0.70106431,  1.09547562,  0.83711118,
        0.79946823,  0.5102981 ,  1.69009134,  1.39530848,  1.30809558,
        1.39284004,  0.98551581,  0.90662288,  1.62866455,  1.03941606,
        1.19166113,  1.17832968,  1.09622518,  0.70742403,  1.32188498,
        0.88178341,  0.89014159,  1.00101464,  0.87170574,  0.76431567,
        0.74366864,  6.71077582,  1.079