# Collect all result CSV files and combine them into a single DataFrame

In [None]:
import os
import pandas as pd

# Directory that is scanned for result files.
results_path = "/home/boonstra/xin_results_24_3(2)"

# Keep only the entries whose extension is ".csv" (case-insensitive).
# NOTE: os.listdir returns entries in arbitrary order, so the row order of
# the combined frame follows whatever order the file system reports.
csv_filenames = [
    entry
    for entry in os.listdir(results_path)
    if os.path.splitext(entry)[1].lower() == ".csv"
]

# Read each file (first line is the header) and tag every row with the file
# name minus its extension, stored in the "seed" column.
data_frames = []
for filename in csv_filenames:
    seed = os.path.splitext(filename)[0]
    df = pd.read_csv(os.path.join(results_path, filename), header=0)
    df["seed"] = seed
    data_frames.append(df)

# Stack the per-file frames; each frame keeps its own row labels here.
all_data = pd.concat(data_frames)

all_data.columns
from hofss import Task, TaskType, Factor, Scenario

# Parse the model definitions. Later parsers receive the collections that
# were parsed before them.
hofs = Factor.parse_from_file("data/hofs_frequencies_and_multipliers.csv")
task_types = TaskType.parse_from_file("data/gtt_nhep_hofs.csv", hofs)
scenarios = Scenario.parse_from_file("data/scenarios.csv")
tasks = Task.parse_from_file("data/tasks.csv", task_types, scenarios)

# Index the parsed tasks by name once instead of scanning the whole list for
# every group. setdefault keeps the first task for a name, matching the
# "first match wins" behaviour of a linear search.
tasks_by_name = {}
for known_task in tasks:
    tasks_by_name.setdefault(known_task.name, known_task)

# Attach the task type name to every result row, one task group at a time.
typed_data = []
all_data["task_type"] = None
for task, group_data in all_data.groupby("task"):
    task_instance = tasks_by_name.get(task)
    if task_instance is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise KeyError(f"Task {task!r} found in the results is not defined in data/tasks.csv")
    # Work on a copy so the assignment never writes into a view of all_data.
    group_data = group_data.copy()
    group_data["task_type"] = task_instance.task_type.name
    typed_data.append(group_data)

# Re-assemble the frame. Rows are now ordered group by group (groupby order)
# and renumbered 0..n-1, so that row labels equal row positions.
all_data = pd.concat(typed_data, ignore_index=True)

In [None]:
import numpy as np
# Rows that belong to a scenario. The mask is a plain boolean array so every
# selection below is positional and does not depend on the index labels.
has_scenario = all_data["scenario"].notna().to_numpy()
scenario_data = all_data[has_scenario].copy()

# "total" of the row directly before each scenario row. shift(1) yields NaN
# for the very first row instead of wrapping around to the last row, which is
# what a positional lookup at "index - 1" would do for position 0.
previous_failure_probabilities = all_data["total"].shift(1)[has_scenario]

# Ratio of a scenario row's "total" to the "total" of the preceding row.
scenario_data["failure_probability_change"] = (
    scenario_data["total"].values / previous_failure_probabilities.values
)

# Keep only finite ratios: drops NaN (missing value, or no preceding row) and
# +/-inf (division by zero).
scenario_data = scenario_data[np.isfinite(scenario_data["failure_probability_change"])]


In [None]:
from matplotlib.axes._axes import Axes
import matplotlib.pyplot as plt

# One histogram per task, stacked vertically with a shared x axis.
# squeeze=False makes subplots always return a 2-D array of axes, so the loop
# below also works when there is exactly one task (the default would return
# a bare Axes object that cannot be iterated).
fig, axs = plt.subplots(len(scenario_data["task"].unique()), sharex=True, squeeze=False)
axs = axs[:, 0]
scale_factor = 3
# 25.4 mm per inch; presumably an 80 mm square figure enlarged by scale_factor.
fig.set_size_inches(w=scale_factor*80/25.4, h=scale_factor*80/25.4)
ax: Axes
for (i, ax), (task, group_data) in zip(enumerate(axs), scenario_data.groupby("task")):
    # Histogram of log10(failure probability change). Ratios below 2e-7 are
    # left out (threshold kept as-is; presumably to exclude near-zero values).
    changes = group_data["failure_probability_change"]
    log_values = np.log10(changes[changes >= 2e-7].to_numpy(dtype=np.float64))
    ax.hist(log_values, linewidth=0.5, edgecolor="black", bins=np.arange(-3.5, 5.0, 0.1))

    # Strip everything but the bottom axis; the task name serves as row label.
    for side in ("top", "left", "right"):
        ax.spines[side].set_visible(False)
    ax.set_ylabel(task, rotation="horizontal")
    is_last_row = i == len(axs) - 1
    ax.tick_params(
        "both", bottom=True, left=False, top=False, labelbottom=is_last_row, labelleft=False
    )
    # log10(ratio) == 0 marks "no change".
    ax.axvline(0, color="black")

fig.tight_layout()

In [None]:
def gmean(a):
    """Return the geometric mean of the values in ``a``.

    The product of all values is raised to the power ``1 / len(a)``, so ``a``
    must be a sized, non-empty collection (list, array, Series, ...). An
    empty ``a`` raises ``ZeroDivisionError``.
    """
    product = np.prod(a)
    return product ** (1 / len(a))

def gmean_contributions(a: pd.Series):
    """Return, per entry of ``a``, its contribution to the geometric mean.

    NaN entries are dropped first. The contribution of an entry is the
    geometric mean of all remaining values divided by the geometric mean of
    the values with that entry left out.

    Args:
        a: Series with string labels; each occurrence of "multiplier" in a
            label is replaced by "contribution" in the result.

    Returns:
        A Series with one value per non-NaN entry of ``a``, labelled with the
        renamed labels. It is empty when ``a`` has no non-NaN entries.

    Raises:
        ZeroDivisionError: if exactly one non-NaN entry remains, because the
            geometric mean of the (empty) rest divides by its length.
    """
    b = a.dropna()
    if b.empty:
        # Nothing to compare; same empty result the element loop would give.
        return pd.Series({})

    # The overall geometric mean is the same for every entry: compute it once.
    overall = gmean(b.values)
    return pd.Series(
        {
            label.replace("multiplier", "contribution"): overall / gmean(b.drop(label))
            for label in b.index
        }
    )

# Columns whose name contains "multiplier"; these feed the contribution
# calculation.
multiplier_columns = [name for name in scenario_data.columns if "multiplier" in name]

# Extend every scenario row with the contribution values computed from its
# multiplier columns.
rows = [
    pd.concat([record, gmean_contributions(record[multiplier_columns])])
    for _, record in scenario_data.iterrows()
]
count = len(rows)

# Each extended row is a Series: concatenating them side by side and
# transposing gives one table row per scenario row again.
enriched_data = pd.concat(rows, axis=1).T

# Split on whether the failure probability went down/stayed equal or went up.
is_degradation = enriched_data["failure_probability_change"] > 1.0
improvement_data = enriched_data[enriched_data["failure_probability_change"] <= 1.0]
degradation_data = enriched_data[is_degradation]


In [None]:
# Row order (task types) and column order (factors F1..F14) of the matrices
# built below.
task_type_order = [tt.name for tt in task_types]
factors = [f"F{i}" for i in range(1, 15)]

# plot_values[r][c]: log10 of the mean of (probability change ** contribution)
# plot_labels[r][c]: mean contribution
# for task type r and factor c.
plot_values = []
plot_labels = []
for task_type in task_type_order:
    subset = enriched_data[enriched_data["task_type"] == task_type]
    probabilities = np.array(subset["failure_probability_change"].values, dtype=np.float64)

    # One float array of contributions per factor, in factor order.
    contributions_per_factor = [
        np.array(subset[f"{factor}_contribution"].values, dtype=np.float64)
        for factor in factors
    ]
    task_type_values = [
        np.log10(np.mean(probabilities**contributions))
        for contributions in contributions_per_factor
    ]
    task_type_labels = [np.mean(contributions) for contributions in contributions_per_factor]

    plot_values.append(task_type_values)
    plot_labels.append(task_type_labels)


In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Heat map of plot_values: one row per task type, one column per factor.
fig, ax = plt.subplots()
scale_factor = 3.0
# 25.4 mm per inch; presumably an 80 mm x 60 mm figure enlarged by scale_factor.
fig.set_size_inches(w=80*scale_factor/25.4, h=60*scale_factor/25.4)
# Pass the colormap by name: matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9, while a name works in every version.
im = ax.imshow(plot_values, cmap="OrRd")

# Show all ticks and label them with the respective list entries
ax.set_xticks(np.arange(len(factors)), labels=[f.description for f in hofs])
ax.set_yticks(np.arange(len(task_type_order)), labels=[tt.description for tt in task_types])
ax.set_xlabel("Factor")
ax.set_ylabel("Task type")

# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

# Write the rounded value into every cell; NaN cells stay blank.
for i, row_values in enumerate(plot_values):
    for j, value in enumerate(row_values):
        if np.isnan(value):
            continue
        ax.text(j, i, round(value, 1), ha="center", va="center")

ax.set_title("Effect on failure probability")
# Shift the layout rectangle slightly to the right before saving.
shift = 0.05
fig.tight_layout(rect=(shift, 0, 1 + shift, 1))
fig.savefig("sensitivity.png")
plt.show()
