# Combine results from different fits

Having fitted a set of speed factors from different but uniform samples, and knowing that the absolute scale is not determined, before comparing values from different samples it is necessary to rescale them to the same scale. My idea is to minimise the modulus of the difference between the first vector and the N-th vector multiplied by a factor, and rescale it accordingly.

In [1]:
import re
import math
import copy
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
import pandas as pd
from scipy.optimize import curve_fit

from __future__ import division

%matplotlib inline

In [2]:
n_files = 10
fact = dict()
factors = defaultdict(dict)   # Dict containing all k_factors fitted for a given CPU, indexed by cpu type and file number
factors_err = defaultdict(dict)   # Dict containing all k_factors fitted for a given CPU, indexed by cpu type and file number
for i in range(n_files):
    input_pattern = 'x_results_cputype_task2_cpu_sub_%s_evgen_0.001.csv'
    f = open(input_pattern % (str(i)))
    for line in f:
        cpu, k, k_err = line.strip().split(',')
        k = float(k)
        factors[cpu][i] = k
        factors_err[cpu][i] = k_err
    f.close()

In [3]:
def dot(x, y):   # a special inner product that skips components where one of the vectors has a NaN
    result = 0.
    for i in range(len(x)):
        if not np.isnan(x[i]) and not np.isnan(y[i]):
            result += x[i] * y[i]
    return result

In [4]:
def rescale_dist(x, y):   # rescales y such that the distance to x is minimised
    y2 = y * x / x   # to put nan in elements where x has nan
    alpha = np.nansum(x * y) / np.nansum(y2 * y2)
    return y * alpha

In [5]:
def rescale_first(x, y):   # rescales y such that it has the first element equal to x's one
    return y / y[0] * x[0]

In [13]:
def rescale_mod(x):   # rescales x such that its modulus is one 
    n = len(x) - np.isnan(x).sum()
    s = math.sqrt(np.nansum(x * x))
    return x / s * math.sqrt(n)

In [42]:
def rescale_1norm(x):   # rescales x such that its modulus is one 
    n = len(x) - np.isnan(x).sum()
    s = np.nansum(np.fabs(x))
    return x / s * n

In [7]:
cmap = dict()   # mapping each CPU type to an integer starting from zero
i = -1
a = np.zeros((len(factors), n_files))   # a matrix with as many rows as CPU types and columns as files
for c in factors.keys():
    i += 1
    cmap[c] = i
    a[i] = [factors[c].setdefault(l, np.nan) for l in range(n_files)]   # a NaN is used if the CPU type is not present in file
b = a.transpose()   # rows are files and columns are speed factors

In [8]:
ref = np.ones(len(factors))

In [43]:
b1 = np.empty_like(b)
b2 = np.empty_like(b)
b3 = np.empty_like(b)
b4 = np.empty_like(b)
b5 = np.empty_like(b)
for i in range(n_files):
    v = b[i]
    b1[i] = rescale_dist(ref, v)
    b2[i] = rescale_first(b[0], v)
    b3[i] = rescale_mod(v)
    b4[i] = rescale_dist(b[0], v)
    b5[i] = rescale_1norm(v)
a1 = b1.transpose()
a2 = b2.transpose()
a3 = b3.transpose()
a4 = b4.transpose()
a5 = b5.transpose()

In [45]:
for i in range(len(a)):
    print '%.3f %.3f %.3f %.3f %.3f %.3f' % (
        np.nanstd(a[i]) / np.nanmean(a[i]),
        np.nanstd(a1[i]) / np.nanmean(a1[i]),
        np.nanstd(a2[i]) / np.nanmean(a2[i]),
        np.nanstd(a3[i]) / np.nanmean(a3[i]),
        np.nanstd(a4[i]) / np.nanmean(a4[i]),
        np.nanstd(a5[i]) / np.nanmean(a5[i])
        )

0.047 0.011 0.000 0.012 0.013 0.014
0.063 0.014 0.016 0.015 0.009 0.015
0.048 0.012 0.015 0.013 0.010 0.015
0.047 0.008 0.012 0.009 0.006 0.009
0.051 0.009 0.016 0.008 0.011 0.008
0.048 0.011 0.016 0.010 0.015 0.011
0.040 0.015 0.018 0.015 0.015 0.016
0.043 0.008 0.013 0.009 0.007 0.011
0.017 0.023 0.024 0.024 0.019 0.024
0.045 0.013 0.020 0.013 0.018 0.013
0.049 0.019 0.027 0.019 0.016 0.020
0.047 0.009 0.017 0.010 0.006 0.011
0.052 0.026 0.024 0.027 0.022 0.029
0.047 0.011 0.015 0.012 0.007 0.014
0.045 0.008 0.012 0.008 0.006 0.009
0.034 0.025 0.029 0.025 0.024 0.026
0.050 0.012 0.018 0.012 0.010 0.012
0.049 0.011 0.018 0.012 0.006 0.013
0.048 0.010 0.015 0.011 0.005 0.012
0.065 0.036 0.036 0.037 0.031 0.038
0.052 0.014 0.020 0.014 0.010 0.015
0.045 0.007 0.013 0.008 0.004 0.009
0.045 0.008 0.012 0.009 0.004 0.010
0.055 0.030 0.042 0.029 0.028 0.029
0.046 0.015 0.018 0.016 0.011 0.017
0.044 0.008 0.012 0.009 0.006 0.010
0.048 0.009 0.014 0.010 0.013 0.011
0.043 0.011 0.017 0.010 0.01

In [47]:
t = 0.
t1 = 0.
t2 = 0.
t3 = 0.
t4 = 0.
t5 = 0.

for i in range(len(a)):
    t += np.nanstd(a[i]) / np.nanmean(a[i])
    t1 += np.nanstd(a1[i]) / np.nanmean(a1[i])
    t2 += np.nanstd(a2[i]) / np.nanmean(a2[i])
    t3 += np.nanstd(a3[i]) / np.nanmean(a3[i])
    t4 += np.nanstd(a4[i]) / np.nanmean(a4[i])
    t5 += np.nanstd(a4[i]) / np.nanmean(a5[i])
print t, t1, t2, t3, t4, t5

4.49182711782 1.66331273082 2.01377079718 1.68858973416 1.55128907926 1.42634450258
