- A: CPU
- B: Memory
- C: Network
- D: Epochs

|         | -               | +                  |
| ------- | :-------------: | :----------------: |
| CPU     | 750m            | 1500m              |
| Memory  | 1.5Gi           | 3Gi                |
| Network | FashionMNISTCNN | FashionMNISTResNet |
| Epochs  | 5               | 10                 |

$2^{4-1}$ Experimental Design

| Run    | I     | A     | B      |C     |D     |
| :----: | :---: | :---: | :----: |:---: |:---: |
| 1 (11) | +     | -     | -      | -    | -    |
| 2 (12) | +     | +     | -      | -    | +    |
| 3 (13) | +     | -     | +      | -    | +    |
| 4 (14) | +     | +     | +      | -    | -    |
| 5 (15) | +     | -     | -      | +    | +    |
| 6 (16) | +     | +     | -      | +    | -    |
| 7 (17) | +     | -     | +      | +    | -    |
| 8 (18) | +     | +     | +      | +    | +    |

Defining relation: $I = ABCD$ (generator $D = ABC$), Resolution $IV$


In [577]:
from collections import defaultdict
from datetime import datetime as dt
from io import StringIO
from pathlib import Path

import dataframe_image as dfi
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import researchpy as rp
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.graphics.factorplots import interaction_plot
from tbparse import SummaryReader

# Kept last so the star import shadows names exactly as it did before.
from pyDOE2 import *

# Root directory of the experiment logs; each run is read from an
# "exp_1<run>" sub-directory below it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
LOG_DIR = "/home/engineer/fltk-testbed/logging"
# Names of the four design factors; also the first four columns of the result frame.
FACTORS = ["CPU", "Memory", "Network", "Epochs"]
# One [low ("-"), high ("+")] pair per factor, in the same order as FACTORS.
LEVELS = [
    ["750m", "1500m"],
    ["1.5Gi", "3Gi"],
    ["FashionMNISTCNN", "FashionMNISTResNet"],
    ["5", "10"],
]

In [578]:
# Factor overview table: one row per factor, its low ("-") and high ("+") level as columns.
df_factor = pd.DataFrame(LEVELS, columns=["-", "+"], index=FACTORS)
dfi.export(df_factor, "./tables/factors.svg", table_conversion="matplotlib")

In [579]:
# Full sign matrix: A, B, C, their two-way interactions, and ABC (the column used for D).
doe_design = fracfact("A B C AB AC BC ABC")
# Level settings only: columns A, B, C and the last column (ABC, i.e. D).
doe = doe_design[:, [0, 1, 2, -1]]
doe

array([[-1., -1., -1., -1.],
       [ 1., -1., -1.,  1.],
       [-1.,  1., -1.,  1.],
       [ 1.,  1., -1., -1.],
       [-1., -1.,  1.,  1.],
       [ 1., -1.,  1., -1.],
       [-1.,  1.,  1., -1.],
       [ 1.,  1.,  1.,  1.]])

In [580]:
# Sign table of the design, with runs numbered from 1 and an identity column "I".
df_sign = pd.DataFrame(
    doe_design.astype(int),
    columns=["A", "B", "C", "AB", "AC", "BC", "D"],
    index=pd.RangeIndex(1, len(doe_design) + 1, name="Run"),
)
df_sign["I"] = np.ones(len(df_sign), dtype=int)
# Main effects first, then the two-way interactions, then the identity column.
df_sign = df_sign.loc[:, ["A", "B", "C", "D", "AB", "AC", "BC", "I"]]
dfi.export(df_sign, "./tables/doe.svg", table_conversion="matplotlib")

# Prepare data

In [581]:
# Translate every design row (+1 / other) into concrete factor levels and append
# two placeholder cells that are filled with accuracy and time further down.
combinations = [
    [pair[1] if sign == 1 else pair[0] for sign, pair in zip(run, LEVELS)] + [0, 0]
    for run in doe
]

# Four rows per design run, kept adjacent.
combinations = np.repeat(combinations, 4, axis=0)

In [582]:
# Last logged "accuracy per epoch" entry of each of the eight experiments.
#
# The list must stay positionally aligned with the design runs because it is
# consumed by index later on. A run whose logs cannot be read therefore
# contributes NaN instead of being silently dropped (which would shift every
# later value onto the wrong run).
accuracies = []
for i in range(1, 9):
    path = f"{LOG_DIR}/exp_1{i}/train_job_0"
    try:
        scalars = SummaryReader(path, pivot=True).scalars
        accuracies.append(scalars["accuracy per epoch"][-1:].values[0])
    except Exception as exc:
        # Best effort: keep going, but report what actually failed. A bare
        # `except:` would also swallow KeyboardInterrupt and hide the cause.
        print(f"Could not read accuracy from {path}: {exc!r}")
        accuracies.append(np.nan)

In [583]:
# Write each run's accuracy value(s) into the second-to-last column of its four rows.
for run_idx, run_accuracy in enumerate(accuracies):
    first_row = 4 * run_idx
    combinations[first_row : first_row + 4, -2] = np.array(run_accuracy)

In [584]:
# Parse the start/stop logs of every experiment into
#   exps[run][key] = {"start": datetime, "stop": datetime}
# where `key` is the third whitespace-separated field of a log line and the
# first two fields are the date and time of the event.
logs = ["start.log", "stop.log"]
exps = {}

for i in range(1, 9):
    try:
        containers = defaultdict(dict)
        for log in logs:
            key = Path(log).stem  # "start" or "stop"
            path = f"{LOG_DIR}/exp_1{i}/{log}"
            with open(path, "r") as f:
                for line in f:
                    fields = line.split()
                    if not fields:
                        # A blank line carries no event; do not discard the run for it.
                        continue
                    value = dt.strptime(f"{fields[0]} {fields[1]}", "%m-%d-%Y %H:%M:%S")
                    containers[fields[2]][key] = value
        exps[i] = containers
    except (OSError, ValueError, IndexError) as exc:
        # Missing/unreadable file, bad timestamp, or too few fields: skip this
        # experiment, but say why instead of always claiming a missing directory.
        print(f"Skipping experiment exp_1{i}: {exc!r}")

In [585]:
# Add the elapsed wall-clock seconds ("time") to every container record.
for containers in exps.values():
    for name, record in containers.items():
        started, stopped = record["start"], record["stop"]
        assert started < stopped, f"{name} started after stopped"
        record["time"] = (stopped - started).total_seconds()

In [586]:
# One list of container durations per experiment, in the insertion order of `exps`.
times = [
    [record["time"] for record in containers.values()]
    for containers in exps.values()
]

In [587]:
# Write each experiment's durations into the last column of its four rows.
for run_idx, run_times in enumerate(times):
    first_row = 4 * run_idx
    combinations[first_row : first_row + 4, -1] = np.array(run_times)

In [588]:
# Result frame: factor levels plus the two responses, converted to numeric columns.
df = pd.DataFrame(combinations, columns=[*FACTORS, "Accuracy", "Time"])
for response in ("Accuracy", "Time"):
    df[response] = pd.to_numeric(df[response])
df

Unnamed: 0,CPU,Memory,Network,Epochs,Accuracy,Time
0,750m,1.5Gi,FashionMNISTCNN,5,89.279999,364.0
1,750m,1.5Gi,FashionMNISTCNN,5,88.739998,374.0
2,750m,1.5Gi,FashionMNISTCNN,5,89.220001,384.0
3,750m,1.5Gi,FashionMNISTCNN,5,88.82,355.0
4,1500m,1.5Gi,FashionMNISTCNN,10,89.220001,353.0
5,1500m,1.5Gi,FashionMNISTCNN,10,88.739998,350.0
6,1500m,1.5Gi,FashionMNISTCNN,10,89.120003,334.0
7,1500m,1.5Gi,FashionMNISTCNN,10,88.900002,355.0
8,750m,3Gi,FashionMNISTCNN,10,89.019997,621.0
9,750m,3Gi,FashionMNISTCNN,10,88.68,643.0


In [589]:
# Export a copy of the result frame whose row labels start at 1 instead of 0.
df_style = df.copy()
df_style.index = df_style.index + 1
dfi.export(df_style, "./tables/df.svg", table_conversion="matplotlib")

In [590]:
# Descriptive statistics of the accuracy response (the recorded output lists
# N, mean, SD, SE and a 95% confidence interval).
# NOTE(review): the recorded output shows a pandas dtype deprecation warning for this call.
rp.summary_cont(df["Accuracy"])






Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value 'Accuracy' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.



Unnamed: 0,Variable,N,Mean,SD,SE,95% Conf.,Interval
0,Accuracy,32.0,88.4856,0.6354,0.1123,88.2565,88.7147


In [591]:
# One box plot per (response, factor) pair: all factor columns go on the x axis,
# boxes are coloured by the current factor, and each figure is saved as SVG.
for metric in ("Accuracy", "Time"):
    for factor in FACTORS:
        axis_labels = {"value": "Factors"}
        fig = px.box(df, x=FACTORS, y=metric, color=factor, labels=axis_labels)
        fig.write_image(f"./figures/{metric}_{factor}.svg")


# Accuracy

In [592]:
# Sum-coded linear model for accuracy: four main effects plus the
# CPU:Memory, CPU:Network and Memory:Network interactions.
accuracy_formula = (
    "Accuracy ~ C(CPU, Sum) + C(Memory, Sum) + C(Network, Sum) + C(Epochs, Sum)"
    " + C(CPU, Sum):C(Memory, Sum)"
    " + C(CPU, Sum):C(Network, Sum)"
    " + C(Memory, Sum):C(Network, Sum)"
)
model = ols(accuracy_formula, df).fit()

# Overall F-test of the fitted model.
print(f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue}")

Overall model F( 7, 24) =  15.941, p = 1.2747196453407767e-07


In [593]:
# Coefficient table of the fitted model (estimate, std err, t, p-value, 95% CI per term).
model.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,88.4856,0.054,1647.468,0.000,88.375,88.596
"C(CPU, Sum)[S.1500m]",-0.0169,0.054,-0.314,0.756,-0.128,0.094
"C(Memory, Sum)[S.1.5Gi]",0.1131,0.054,2.106,0.046,0.002,0.224
"C(Network, Sum)[S.FashionMNISTCNN]",0.5381,0.054,10.019,0.000,0.427,0.649
"C(Epochs, Sum)[S.10]",0.0131,0.054,0.244,0.809,-0.098,0.124
"C(CPU, Sum)[S.1500m]:C(Memory, Sum)[S.1.5Gi]",0.0256,0.054,0.477,0.638,-0.085,0.136
"C(CPU, Sum)[S.1500m]:C(Network, Sum)[S.FashionMNISTCNN]",-0.0319,0.054,-0.593,0.558,-0.143,0.079
"C(Memory, Sum)[S.1.5Gi]:C(Network, Sum)[S.FashionMNISTCNN]",-0.1319,0.054,-2.455,0.022,-0.243,-0.021


In [594]:
# Turn the coefficient table of the fitted model into a DataFrame with readable
# row/column labels and export it as an SVG table.
coef_html = model.summary().tables[1].as_html()
# read_html expects a path or file-like object; passing the raw HTML string is
# deprecated in pandas, so wrap it in StringIO.
sum_table = pd.read_html(StringIO(coef_html), header=0, index_col=0)[0]
idx = sum_table.index
col = sum_table.columns
# Row 0 is the intercept and keeps its label; rows 1-7 are the model terms.
sum_table = sum_table.rename(
    index={
        idx[1]: "CPU (1500m)",
        idx[2]: "Memory (1.5Gi)",
        idx[3]: "Network (FashionMNISTCNN)",
        idx[4]: "Epochs (10)",
        idx[5]: "CPU (1500m)*Memory (1.5Gi)",
        idx[6]: "CPU (1500m)*Network (FashionMNISTCNN)",
        idx[7]: "Memory (1.5Gi)*Network (FashionMNISTCNN)",
    },
    columns={
        col[0]: "Coefficient",
        col[1]: "Standard Error",
        col[2]: "t-Statistic",
        col[3]: "p-value",
        col[4]: "CI 0.025",
        col[5]: "CI 0.975",
    },
)
dfi.export(
    sum_table,
    "./tables/anova_summary_accuracy_1.svg",
    table_conversion="matplotlib",
    fontsize=4,
)


Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



In [595]:
# Type III ANOVA table (sum of squares, df, F, p-value) for the currently fitted `model`.
res = sm.stats.anova_lm(model, typ=3)
res

Unnamed: 0,sum_sq,df,F,PR(>F)
Intercept,250550.588125,1.0,2714151.0,3.682942e-62
"C(CPU, Sum)",0.009113,1.0,0.09871652,0.7560903
"C(Memory, Sum)",0.409511,1.0,4.436133,0.0458293
"C(Network, Sum)",9.266514,1.0,100.3818,4.735911e-10
"C(Epochs, Sum)",0.005513,1.0,0.05971933,0.8090173
"C(CPU, Sum):C(Memory, Sum)",0.021013,1.0,0.2276271,0.6376048
"C(CPU, Sum):C(Network, Sum)",0.032512,1.0,0.352192,0.558427
"C(Memory, Sum):C(Network, Sum)",0.556513,1.0,6.028562,0.02170205
Residual,2.215505,24.0,,


In [596]:
# Relabel the ANOVA table with the design's effect letters and readable column
# titles, then export it. Row 0 (Intercept) and the last row (Residual) keep their names.
effect_labels = ["A", "B", "C", "D", "AB", "AC", "BC"]
column_labels = ["Sum of Squares", "Degrees of Freedom", "F-test", "p-value"]
res_style = res.rename(
    index=dict(zip(res.index[1:8], effect_labels)),
    columns=dict(zip(res.columns[:4], column_labels)),
)
dfi.export(res_style, "./tables/anova_accuracy_1.svg", table_conversion="matplotlib")

In [597]:
# Normal Q-Q plot of the model residuals with the least-squares reference line.
(theoretical, ordered), fit = stats.probplot(model.resid, rvalue=True)
slope, intercept = fit[0], fit[1]
# The reference line only needs its two end points.
line_x = np.array([theoretical[0], theoretical[-1]])

fig = go.Figure()
fig.add_scatter(x=theoretical, y=ordered, mode="markers")
fig.add_scatter(x=line_x, y=intercept + slope * line_x, mode="lines")
fig.update_layout(
    xaxis_title="Theoretical Quantiles",
    yaxis_title="Ordered Values",
    showlegend=False,
)
fig.show()
fig.write_image("./figures/qq_accuracy.svg")

In [598]:
# Residuals against fitted values, with a dashed zero line for reference.
marker_style = {"size": 12, "line": {"width": 1, "color": "DarkSlateGrey"}}
fig = (
    px.scatter(x=model.fittedvalues, y=model.resid)
    .add_hline(y=0, line_dash="dash", line_color="red")
    .update_traces(marker=marker_style)
    .update_layout(xaxis_title="Predicted", yaxis_title="Residuals")
)
fig.show()
fig.write_image("./figures/Residuals_Predicted_accuracy.svg")

In [599]:
# Residuals in row order, with a dashed zero line for reference.
# The x axis is derived from the number of residuals rather than a hard-coded
# 32, so the plot still works if the fitted data has a different row count.
run_numbers = range(1, len(model.resid) + 1)
fig = px.scatter(x=run_numbers, y=model.resid)
fig.add_hline(
    y=0,
    line_dash="dash",
    line_color="red",
)
fig.update_traces(
    marker=dict(size=12, line=dict(width=1, color="DarkSlateGrey")),
)
fig.update_layout(
    xaxis_title="Run",
    yaxis_title="Residuals",
)
fig.show()
fig.write_image("./figures/run_residuals_accuracy.svg")

# Time

In [600]:
# Sum-coded linear model for time: four main effects plus the
# CPU:Memory, CPU:Network and Memory:Network interactions.
time_formula = (
    "Time ~ C(CPU, Sum) + C(Memory, Sum) + C(Network, Sum) +C(Epochs, Sum)"
    " + C(CPU, Sum):C(Memory, Sum)"
    " + C(CPU, Sum):C(Network, Sum)"
    " + C(Memory, Sum):C(Network, Sum)"
)
model = ols(time_formula, df).fit()

# Overall F-test of the fitted model.
print(f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue}")

Overall model F( 7, 24) =  96960.140, p = 8.034193178380353e-52


In [601]:
# Shapiro-Wilk test on the residuals of the currently fitted `model`.
stats.shapiro(model.resid)

ShapiroResult(statistic=0.9599804878234863, pvalue=0.2743379771709442)

In [602]:
# Normal Q-Q plot of the model residuals with the least-squares reference line.
(theoretical, ordered), fit = stats.probplot(model.resid, rvalue=True)
slope, intercept = fit[0], fit[1]
# The reference line only needs its two end points.
line_x = np.array([theoretical[0], theoretical[-1]])

fig = go.Figure()
fig.add_scatter(x=theoretical, y=ordered, mode="markers")
fig.add_scatter(x=line_x, y=intercept + slope * line_x, mode="lines")
fig.update_layout(
    xaxis_title="Theoretical Quantiles",
    yaxis_title="Ordered Values",
    showlegend=False,
)
fig.show()
fig.write_image("./figures/qq_time.svg")

In [603]:
# Coefficient table of the fitted model (estimate, std err, t, p-value, 95% CI per term).
model.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2306.1875,3.144,733.422,0.000,2299.698,2312.677
"C(CPU, Sum)[S.1500m]",-883.3750,3.144,-280.934,0.000,-889.865,-876.885
"C(Memory, Sum)[S.1.5Gi]",369.0000,3.144,117.351,0.000,362.510,375.490
"C(Network, Sum)[S.FashionMNISTCNN]",-1927.5625,3.144,-613.011,0.000,-1934.052,-1921.073
"C(Epochs, Sum)[S.10]",873.0625,3.144,277.655,0.000,866.573,879.552
"C(CPU, Sum)[S.1500m]:C(Memory, Sum)[S.1.5Gi]",-763.4375,3.144,-242.791,0.000,-769.927,-756.948
"C(CPU, Sum)[S.1500m]:C(Network, Sum)[S.FashionMNISTCNN]",763.1250,3.144,242.692,0.000,756.635,769.615
"C(Memory, Sum)[S.1.5Gi]:C(Network, Sum)[S.FashionMNISTCNN]",-389.0000,3.144,-123.711,0.000,-395.490,-382.510


In [604]:
# Turn the coefficient table of the fitted model into a DataFrame with readable
# row/column labels and export it as an SVG table.
coef_html = model.summary().tables[1].as_html()
# read_html expects a path or file-like object; passing the raw HTML string is
# deprecated in pandas, so wrap it in StringIO.
sum_table = pd.read_html(StringIO(coef_html), header=0, index_col=0)[0]
idx = sum_table.index
col = sum_table.columns
# Row 0 is the intercept and keeps its label; rows 1-7 are the model terms.
sum_table = sum_table.rename(
    index={
        idx[1]: "CPU (1500m)",
        idx[2]: "Memory (1.5Gi)",
        idx[3]: "Network (FashionMNISTCNN)",
        idx[4]: "Epochs (10)",
        idx[5]: "CPU (1500m)*Memory (1.5Gi)",
        idx[6]: "CPU (1500m)*Network (FashionMNISTCNN)",
        idx[7]: "Memory (1.5Gi)*Network (FashionMNISTCNN)",
    },
    columns={
        col[0]: "Coefficient",
        col[1]: "Standard Error",
        col[2]: "t-Statistic",
        col[3]: "p-value",
        col[4]: "CI 0.025",
        col[5]: "CI 0.975",
    },
)
dfi.export(
    sum_table,
    "./tables/anova_summary_time_1.svg",
    table_conversion="matplotlib",
    fontsize=4,
)


Passing literal html to 'read_html' is deprecated and will be removed in a future version. To read from a literal string, wrap it in a 'StringIO' object.



In [605]:
# Residuals against fitted values, with a dashed zero line for reference.
marker_style = {"size": 12, "line": {"width": 1, "color": "DarkSlateGrey"}}
fig = (
    px.scatter(x=model.fittedvalues, y=model.resid)
    .add_hline(y=0, line_dash="dash", line_color="red")
    .update_traces(marker=marker_style)
    .update_layout(xaxis_title="Predicted", yaxis_title="Residuals")
)
fig.show()
fig.write_image("./figures/Residuals_Predicted_time.svg")

In [606]:
# Residuals in row order, with a dashed zero line for reference.
# The x axis is derived from the number of residuals rather than a hard-coded
# 32, so the plot still works if the fitted data has a different row count.
run_numbers = range(1, len(model.resid) + 1)
fig = px.scatter(x=run_numbers, y=model.resid)
fig.add_hline(
    y=0,
    line_dash="dash",
    line_color="red",
)
fig.update_traces(
    marker=dict(size=12, line=dict(width=1, color="DarkSlateGrey")),
)
fig.update_layout(
    xaxis_title="Run",
    yaxis_title="Residuals",
)
fig.show()
fig.write_image("./figures/run_residuals_time.svg")

In [607]:
# Type III ANOVA table (sum of squares, df, F, p-value) for the currently fitted `model`.
res = sm.stats.anova_lm(model, typ=3)
res

Unnamed: 0,sum_sq,df,F,PR(>F)
Intercept,170192000.0,1.0,537908.553763,1.002586e-53
"C(CPU, Sum)",24971240.0,1.0,78924.062422,1.0041760000000001e-43
"C(Memory, Sum)",4357152.0,1.0,13771.205373,1.240103e-34
"C(Network, Sum)",118895900.0,1.0,375782.161454,7.418102000000001e-52
"C(Epochs, Sum)",24391620.0,1.0,77092.102851,1.3309700000000001e-43
"C(CPU, Sum):C(Memory, Sum)",18650780.0,1.0,58947.609798,3.328224e-42
"C(CPU, Sum):C(Network, Sum)",18635510.0,1.0,58899.361296,3.361075e-42
"C(Memory, Sum):C(Network, Sum)",4842272.0,1.0,15304.474616,3.50085e-35
Residual,7593.5,24.0,,


In [608]:
# Relabel the ANOVA table with the design's effect letters and readable column
# titles, then export it. Row 0 (Intercept) and the last row (Residual) keep their names.
effect_labels = ["A", "B", "C", "D", "AB", "AC", "BC"]
column_labels = ["Sum of Squares", "Degrees of Freedom", "F-test", "p-value"]
res_style = res.rename(
    index=dict(zip(res.index[1:8], effect_labels)),
    columns=dict(zip(res.columns[:4], column_labels)),
)
dfi.export(res_style, "./tables/anova_time_1.svg", table_conversion="matplotlib")

# Accuracy vs Time

In [609]:
# Mean accuracy and time per factor combination, exported as an SVG table.
grouped_by_design = df.groupby(FACTORS)
df_mean = grouped_by_design.mean()
dfi.export(df_mean, "./tables/accuracy_time.svg", table_conversion="matplotlib", fontsize=4)

In [610]:
# Mean accuracy against mean time per factor combination, coloured by accuracy.
marker_style = {"size": 12, "line": {"width": 1, "color": "DarkSlateGrey"}}
fig = px.scatter(df_mean, x="Time", y="Accuracy", color="Accuracy").update_traces(
    marker=marker_style
)
fig.show()
fig.write_image("./figures/accuracy_time.svg")