# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path


def _adjust_billing(data_X, data_Y, hpa_X, hpa_Y, adjust_X, adjust_Y, flow):
    """Equalise two paired billing series; return ``(adjusted_X, adjusted_Y)``.

    For every index the difference ``adjust_X[i] - adjust_Y[i]`` is computed.
    The series with the smaller ``adjust`` value receives a surcharge of
    ``pods * |difference|`` (rounded to 2 decimals); the other value is
    passed through untouched.  ``pods`` is 6 for the ``'total'`` flow and 5
    otherwise, plus 1 when the surcharged series' label starts with
    ``'MAHA'``.

    Raises:
        ValueError: if the four input sequences differ in length.
    """
    if not (len(data_X) == len(data_Y) == len(adjust_X) == len(adjust_Y)):
        raise ValueError('billing adjustment needs data_X, data_Y, adjust_X and adjust_Y '
                         'to have the same length')

    base_pods = 6 if flow == 'total' else 5
    pods_X = base_pods + 1 if hpa_X.startswith('MAHA') else base_pods
    pods_Y = base_pods + 1 if hpa_Y.startswith('MAHA') else base_pods

    adjusted_X = []
    adjusted_Y = []
    for value_X, value_Y, time_X, time_Y in zip(data_X, data_Y, adjust_X, adjust_Y):
        time_adjustment = time_X - time_Y
        if time_adjustment > 0:
            # X has the larger adjust value: surcharge Y.
            adjusted_X.append(value_X)
            adjusted_Y.append(round(value_Y + pods_Y * time_adjustment, 2))
        else:
            # Y has the larger (or equal) adjust value: surcharge X.  The
            # difference is <= 0 here, so its magnitude must be used;
            # adding the signed value would *reduce* X instead.
            adjusted_Y.append(value_Y)
            adjusted_X.append(round(value_X + pods_X * abs(time_adjustment), 2))
    return adjusted_X, adjusted_Y


def _bootstrap_group_means(values_X, values_Y, repetitions):
    """Return ``(means_X, means_Y)`` arrays holding one group mean per replicate.

    Each replicate draws ``len(values_X) + len(values_Y)`` observations with
    replacement from the pooled, labelled data and averages them per label.
    A replicate in which one label is absent has no defined group mean and is
    redrawn, so exactly ``repetitions`` usable replicates are returned.
    """
    values = np.concatenate([np.asarray(values_X, dtype=float),
                             np.asarray(values_Y, dtype=float)])
    pool_size = values.size
    from_Y = np.arange(pool_size) >= len(values_X)

    means_X = np.empty(repetitions)
    means_Y = np.empty(repetitions)
    filled = 0
    while filled < repetitions:
        # Uses the global numpy random state, so np.random.seed() makes a run
        # reproducible.
        picks = np.random.randint(0, pool_size, size=(repetitions - filled, pool_size))
        picked_from_Y = from_Y[picks]
        count_Y = picked_from_Y.sum(axis=1)
        count_X = pool_size - count_Y
        usable = (count_X > 0) & (count_Y > 0)

        picked_values = values[picks][usable]
        picked_from_Y = picked_from_Y[usable]
        sum_X = np.where(picked_from_Y, 0.0, picked_values).sum(axis=1)
        sum_Y = np.where(picked_from_Y, picked_values, 0.0).sum(axis=1)

        kept = int(usable.sum())
        means_X[filled:filled + kept] = sum_X / count_X[usable]
        means_Y[filled:filled + kept] = sum_Y / count_Y[usable]
        filled += kept
    return means_X, means_Y


def _plot_mean_diffs(mean_diffs, left_ci, right_ci, mean_of_mean_diffs, bins, image_path):
    """Draw the histogram of bootstrap mean differences, save it and show it."""
    my_dpi = 96
    figure = plt.figure(figsize=(1200 / my_dpi, 800 / my_dpi), dpi=my_dpi)
    plt.xlabel('Mean difference')
    plt.ylabel('Number of mean differences in bin')
    plt.hist(mean_diffs, color=['black'], bins=bins, label='Mean differences')
    plt.axvline(x=left_ci, color='red', linewidth=3.0, linestyle='-', label='Left confidence interval')
    plt.axvline(x=right_ci, color='red', linewidth=3.0, linestyle='-', label='Right confidence interval')
    plt.axvline(x=mean_of_mean_diffs, color='green', linewidth=3.0, linestyle='-',
                label='Mean of mean differences')
    plt.legend(loc='upper right')
    plt.savefig(str(image_path))
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(figure)


def bootstrap_for_CI(data_X, data_Y, hpa_X, hpa_Y, adjust_X, adjust_Y, msg_count, flow, metric,
                     *, alpha=0.05, repetitions=10000, make_plot=True):
    """Bootstrap a confidence interval for ``mean(Y) - mean(X)`` and report it.

    The result is printed, written to
    ``mean_diff_<msg_count>_msg_<flow>_flow_<metric>_metric/<name>.txt`` and,
    unless ``make_plot`` is false, plotted as a histogram saved next to the
    text file as ``<name>.jpg`` and shown.  The relative difference
    ``mean(Y) / mean(X) - 1`` is reported alongside as a percentage.

    Args:
        data_X: Observations of the first variant.
        data_Y: Observations of the second variant.
        hpa_X: Label of the first variant; used in the output file name and,
            for billing, to decide the pod count (labels starting with
            ``'MAHA'`` get one extra pod).
        hpa_Y: Label of the second variant, used like ``hpa_X``.
        adjust_X: Per-observation values paired with ``data_X``; only read
            when ``metric == 'billing'``.  The code treats the difference to
            ``adjust_Y`` as a time -- presumably run durations; confirm with
            the data owner.
        adjust_Y: Counterpart of ``adjust_X`` for ``data_Y``.
        msg_count: Message count; only used to build output names.
        flow: Flow name; ``'total'`` selects 6 pods for the billing
            adjustment, anything else 5.  Also part of the output names.
        metric: ``'billing'`` enables the billing adjustment and the extra
            raw/adjusted section in the report; any other value uses the data
            as given.  Also part of the output names.
        alpha: Significance level; the interval covers ``1 - alpha``.
        repetitions: Number of bootstrap replicates.
        make_plot: When false, no figure is created, saved or shown.

    Raises:
        ValueError: if ``data_X`` or ``data_Y`` is empty, if ``repetitions``
            is smaller than 1, or if the billing inputs differ in length.
    """
    if len(data_X) == 0 or len(data_Y) == 0:
        raise ValueError('data_X and data_Y must each contain at least one value')
    if repetitions < 1:
        raise ValueError('repetitions must be at least 1')

    if metric == 'billing':
        data_X_adjusted, data_Y_adjusted = _adjust_billing(
            data_X, data_Y, hpa_X, hpa_Y, adjust_X, adjust_Y, flow)
    else:
        data_X_adjusted = data_X
        data_Y_adjusted = data_Y

    print('Raw data:')
    print(data_X)
    print(data_Y)
    print('Adjusted data:')
    print(data_X_adjusted)
    print(data_Y_adjusted)

    means_X, means_Y = _bootstrap_group_means(data_X_adjusted, data_Y_adjusted, repetitions)
    mean_diffs = means_Y - means_X
    mean_pcs = means_Y / means_X - 1

    lower_percentile = alpha / 2 * 100
    upper_percentile = 100 - alpha / 2 * 100
    # float() keeps the printed/written numbers plain Python floats.
    mean_of_mean_diffs = round(float(np.mean(mean_diffs)), 2)
    left_ci = round(float(np.percentile(mean_diffs, lower_percentile)), 2)
    right_ci = round(float(np.percentile(mean_diffs, upper_percentile)), 2)
    mean_of_mean_pcs = round(float(np.mean(mean_pcs)) * 100, 2)
    left_ci_pc = round(float(np.percentile(mean_pcs, lower_percentile)) * 100, 2)
    right_ci_pc = round(float(np.percentile(mean_pcs, upper_percentile)) * 100, 2)

    mean_of_mean_diffs_print = f'Mean of mean differences:{mean_of_mean_diffs} ({mean_of_mean_pcs}%)'
    ci_print = (f'{(1 - alpha) * 100}% confidence interval for the mean differences: '
                f'({left_ci}, {right_ci}) ({left_ci_pc}%, {right_ci_pc}%)')
    print(mean_of_mean_diffs_print)
    print(ci_print)

    folder = Path(f'mean_diff_{msg_count}_msg_{flow}_flow_{metric}_metric')
    file_name = (f'mean_diff_{msg_count}_msg_{flow}_flow_{metric}_metric_'
                 f'{hpa_X}_vs_{hpa_Y}').replace(' ', '_')
    folder.mkdir(parents=True, exist_ok=True)

    report_lines = [f'Statistics for {file_name}', mean_of_mean_diffs_print, ci_print]
    if metric == 'billing':
        report_lines += [
            '',
            'Raw billing data:',
            str(data_X),
            str(data_Y),
            'Adjusted billing data used for comparison:',
            str(data_X_adjusted),
            str(data_Y_adjusted),
        ]
    with open(folder / (file_name + '.txt'), 'w') as f:
        f.write('\n'.join(report_lines) + '\n')

    if make_plot:
        # At least one bin, so small repetition counts still plot.
        bins = max(1, int(repetitions / 10))
        _plot_mean_diffs(mean_diffs, left_ci, right_ci, mean_of_mean_diffs, bins,
                         folder / (file_name + '.jpg'))


# Whitespace-separated sample values, one string per variant (index 0-3).
# The "*_adjust" series are handed to bootstrap_for_CI as adjust_X / adjust_Y,
# which only reads them when metric == 'billing'.
data_0_str_adjust = '388.47	389.72	389.28	389.03	387.66	388.18	389.76	389.79	386.70	386.81'
data_1_str_adjust = '217.75	235.91	205.53	296.66	233.10	257.40	234.12	231.94	231.01	281.55'
data_2_str_adjust = '308.89	314.48	309.61	306.03	313.82	307.35	312.97	310.67	314.01	308.47'
data_3_str_adjust = '157.06	177.17	155.64	160.24	163.79	168.74	171.96	166.69	176.01	165.05'

# Measurements that are actually compared (data_X / data_Y arguments).
data_0_str = '1414.17	1417.25	1416.02	1417.02	1412.20	1418.15	1418.48	1420.46	1411.33	1413.13'
data_1_str = '1202.12	1160.10	1423.04	1257.38	1317.96	1314.94	1421.14	1422.13	1378.88	1206.63'
data_2_str = '352.03	356.35	351.36	341.78	337.37	345.49	341.31	333.40	359.39	343.02'
data_3_str = '172.84	177.70	157.49	171.29	177.18	169.25	172.90	172.71	176.52	167.52'

# Turn every string into a list of floats.
data_0, data_1, data_2, data_3 = (
    list(map(float, raw.split()))
    for raw in (data_0_str, data_1_str, data_2_str, data_3_str)
)

data_0_adjust, data_1_adjust, data_2_adjust, data_3_adjust = (
    list(map(float, raw.split()))
    for raw in (data_0_str_adjust, data_1_str_adjust, data_2_str_adjust, data_3_str_adjust)
)

# Labels of the four variants; they end up in the output file names.
hpa_0 = 'no scaling'
hpa_1 = 'CPU HPA'
hpa_2 = 'MAHA without follow-up'
hpa_3 = 'MAHA with follow-up'

# Run configuration. Alternatives kept from the original script:
# flow = 'business', metric = 'billing'.
msg_count = 750
flow = 'total'
metric = 'processing'

# Compare every variant with every later one:
# (0,1), (0,2), (0,3), (1,2), (1,3), (2,3).
variants = (
    (data_0, hpa_0, data_0_adjust),
    (data_1, hpa_1, data_1_adjust),
    (data_2, hpa_2, data_2_adjust),
    (data_3, hpa_3, data_3_adjust),
)
for first, (values_x, label_x, times_x) in enumerate(variants):
    for values_y, label_y, times_y in variants[first + 1:]:
        bootstrap_for_CI(values_x, values_y, label_x, label_y, times_x, times_y, msg_count, flow, metric)
