# Combine results from different fits

Having fitted a set of speed factors from different but uniform samples, and knowing that the absolute scale is not determined, before comparing values from different samples it is necessary to rescale them to the same scale. My idea is to minimise the modulus of the difference between the first vector and the N-th vector multiplied by a factor, and rescale it accordingly.

In [1]:
import math
import csv
from collections import defaultdict
import numpy as np

from __future__ import division

%matplotlib inline

Let's load in memory the test results from different files (assuming they are homogeneous, i.e. they are different measurements of the same quantities under the same conditions).

In [2]:
n_files = 10
fact = dict()
factors = defaultdict(dict)   # Dict containing all k_factors fitted for a given CPU, indexed by cpu type and file number
factors_err = defaultdict(dict)   # Dict containing all k_factors fitted for a given CPU, indexed by cpu type and file number
for i in range(n_files):
    input_pattern = 'results_cputype_task_cpu_sub_%s_pile_0.001.csv'
    f = open(input_pattern % (str(i)))
    for line in f:
        cpu, k, k_err = line.strip().split(',')
        k = float(k)
        factors[cpu][i] = k
        factors_err[cpu][i] = k_err
    f.close()

A series of "rescaling functions" are defined and tested to see how much they differ in their effects.

In [3]:
def dot(x, y):   # a special inner product that skips components where one of the vectors has a NaN
    result = 0.
    for i in range(len(x)):
        if not np.isnan(x[i]) and not np.isnan(y[i]):
            result += x[i] * y[i]
    return result

In [4]:
def rescale_dist(x, y):   # rescales y such that the distance to x is minimised
    y2 = y * x / x   # to put nan in elements where x has NaN
    alpha = np.nansum(x * y) / np.nansum(y2 * y2)
    return y * alpha

In [5]:
def rescale_first(x, y):   # rescales y such that it has the first element equal to x's one
    return y / y[0] * x[0]

In [6]:
def rescale_mod(x):   # rescales x such that its Euclidean norm is one (NaN components are ignored)
    n = len(x) - np.isnan(x).sum()
    s = math.sqrt(np.nansum(x * x))
    return x / s * math.sqrt(n)

In [7]:
def rescale_1norm(x):   # rescales x such that its l-1 norm is one 
    n = len(x) - np.isnan(x).sum()
    s = np.nansum(np.fabs(x))
    return x / s * n

Let's put the numbers in a matrix b where rows correspond to files and CPUs (mapped to numbers) to columns.

In [8]:
cmap = dict()   # mapping each CPU type to an integer starting from zero
i = -1
a = np.zeros((len(factors), n_files))   # a matrix with as many rows as CPU types and columns as files
for c in factors.keys():
    i += 1
    cmap[c] = i
    a[i] = [factors[c].setdefault(l, np.nan) for l in range(n_files)]   # a NaN is used if the CPU type is not present in file
b = a.transpose()   # rows are files and columns are speed factors

Let's use as "reference" a vector where all CPU factors are one.

In [9]:
ref = np.ones(len(factors))

Let's convert the speed factors according to different methods to make them "averageable".

In [10]:
b1 = np.empty_like(b)
b2 = np.empty_like(b)
b3 = np.empty_like(b)
b4 = np.empty_like(b)
b5 = np.empty_like(b)
for i in range(n_files):
    v = b[i]   # speed factors for file i
    b1[i] = rescale_dist(ref, v)   
    b2[i] = rescale_first(b[0], v)   # the first factor of the first file is used as reference (AV's suggestion)
    b3[i] = rescale_mod(v)
    b4[i] = rescale_dist(b[0], v)   # AS's initial idea
    b5[i] = rescale_1norm(v)
a1 = b1.transpose()
a2 = b2.transpose()
a3 = b3.transpose()
a4 = b4.transpose()
a5 = b5.transpose()

Let's visually inspect how the RMSes of the rescaled values for all files compare, for different rescaling methods.

In [12]:
for i in range(len(a)):
    print '%.4f %.4f %.4f %.4f %.4f %.4f' % (
        np.nanstd(a[i]) / np.nanmean(a[i]),
        np.nanstd(a1[i]) / np.nanmean(a1[i]),
        np.nanstd(a2[i]) / np.nanmean(a2[i]),
        np.nanstd(a3[i]) / np.nanmean(a3[i]),
        np.nanstd(a4[i]) / np.nanmean(a4[i]),
        np.nanstd(a5[i]) / np.nanmean(a5[i])
        )

0.0077 0.0069 0.0000 0.0079 0.0058 0.0090
0.0064 0.0052 0.0049 0.0063 0.0026 0.0075
0.0066 0.0077 0.0071 0.0083 0.0061 0.0091
0.0074 0.0079 0.0042 0.0088 0.0048 0.0099
0.0035 0.0044 0.0117 0.0040 0.0057 0.0040
0.0049 0.0064 0.0085 0.0067 0.0041 0.0073
0.0097 0.0103 0.0109 0.0110 0.0072 0.0118
0.0116 0.0105 0.0113 0.0113 0.0082 0.0123
0.0049 0.0055 0.0067 0.0062 0.0030 0.0072
0.0063 0.0060 0.0085 0.0062 0.0057 0.0066
0.0078 0.0095 0.0093 0.0090 0.0094 0.0086
0.0106 0.0115 0.0080 0.0115 0.0103 0.0117
0.0151 0.0164 0.0053 0.0165 0.0149 0.0167
0.0125 0.0110 0.0136 0.0118 0.0091 0.0128
0.0104 0.0102 0.0025 0.0111 0.0080 0.0121
0.0196 0.0199 0.0197 0.0200 0.0193 0.0202
0.0092 0.0087 0.0123 0.0091 0.0066 0.0098
0.0109 0.0104 0.0037 0.0108 0.0094 0.0113
0.0084 0.0082 0.0000 0.0084 0.0073 0.0087
0.0139 0.0141 0.0108 0.0145 0.0121 0.0151
0.0107 0.0121 0.0057 0.0128 0.0077 0.0136
0.0081 0.0091 0.0093 0.0094 0.0070 0.0099
0.0161 0.0135 0.0089 0.0143 0.0124 0.0152
0.0093 0.0098 0.0038 0.0100 0.0088

This shows that any rescaling method strongly reduces the RMS, which is consistent with the fact that the speed factors fitted on different values don't have a priori the same scale. Therefore, rescaling is absolutely necessary if we want to combine different measurements.

Another way to see this is to calculate the sum of all the normalised rms and compare the totals.

In [13]:
t = 0.
t1 = 0.
t2 = 0.
t3 = 0.
t4 = 0.
t5 = 0.

for i in range(len(a)):
    t += np.nanstd(a[i]) / np.nanmean(a[i])
    t1 += np.nanstd(a1[i]) / np.nanmean(a1[i])
    t2 += np.nanstd(a2[i]) / np.nanmean(a2[i])
    t3 += np.nanstd(a3[i]) / np.nanmean(a3[i])
    t4 += np.nanstd(a4[i]) / np.nanmean(a4[i])
    t5 += np.nanstd(a4[i]) / np.nanmean(a5[i])
print t, t1, t2, t3, t4, t5

0.789770530134 0.78962567895 nan 0.820041477796 0.70406326195 0.682472287992


These numbers show that all rescaling methods give similarly sized rms.

Finally, let's actually calculate the combined speed factors.

In [16]:
output_file = 'results_cputype_task_cpu_pile_0.001.csv'
with open(output_file, 'wb') as csvfile:
    w = csv.writer(csvfile, delimiter=',')
    for s in sorted(cmap.keys()):
        print '%.4f' % np.nanmean(a5[cmap[s]])
        w.writerow([s, np.nanmean(a5[cmap[s]])])

0.8489
0.6896
0.7781
0.6641
0.6792
0.6980
0.6634
0.5614
0.6513
0.7001
0.9648
0.7779
0.8099
0.8402
1.1666
0.9088
0.8789
1.0376
1.0779
1.1355
1.2053
0.9408
0.9012
0.7630
1.0987
0.9048
0.8968
0.9172
1.1782
1.0479
0.6636
1.0399
0.9646
0.7942
0.9659
1.2298
1.2756
1.1353
0.9866
0.9696
1.0092
1.3934
1.1014
0.8670
1.1570
0.8698
1.1622
0.8058
0.8994
0.6589
1.0147
1.2249
1.2349
0.9550
1.1271
1.0497
1.0792
0.9950
1.0470
0.9750
1.2564
1.0520
1.0713
1.0617
1.1303
1.0815
1.1596
1.2947
1.4621
1.7292
1.2354
1.0873
1.1409
0.7079
0.7795
0.9065
1.1696
