# Import Libs

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats #for t value

# import Data & Create Dataframes

In [2]:
# Get logs-index
df_logIndex = pd.read_excel(r'./logs-index.xlsx', index_col=None, header=1)

# Remove empty rows and columns
df_logIndex.dropna(axis=1, how='all', inplace=True) #delete cols with NaN
df_logIndex.dropna(axis=0, how='all', inplace=True) #delete cols with NaN
#df_logIndex_B.drop('Unnamed: 21',axis=1, inplace=True)

# Split into batch dataframe
df_Batch = df_logIndex[df_logIndex['Mode']=='batch']   #filter for batch
df_Batch = df_Batch[df_logIndex['CPU']=='3000m']       #filter for 3000m cpu
df_Batch['Time_scaled']=pd.to_numeric(df_Batch['Time_scaled']) 
#df_Batch = df_Batch.assign(b0=1)                         #add intercept values = 1

# Split into simulated dataframe
df_Sim = df_logIndex[df_logIndex['Mode']=='simulated'] #filter for simulated
#df_Sim = df_Sim.assign(b0=1)                             #add intercept values = 1

  df_Batch = df_Batch[df_logIndex['CPU']=='3000m']       #filter for 3000m cpu


In [3]:
print(df_Batch)

   Purpose  Nodes    CPU Memory          Network  Epochs  Batch_size  \
1      NaN    1.0  3000m    2Gi  FashionMNISTCNN    80.0       128.0   
2      NaN    1.0  3000m    2Gi  FashionMNISTCNN    80.0       128.0   
3      NaN    1.0  3000m    2Gi  FashionMNISTCNN    80.0       128.0   
4      NaN    2.0  3000m    2Gi  FashionMNISTCNN    80.0       128.0   
5      NaN    2.0  3000m    2Gi  FashionMNISTCNN    80.0       128.0   
6      NaN    2.0  3000m    2Gi  FashionMNISTCNN    80.0       128.0   
7      NaN    4.0  3000m    2Gi  FashionMNISTCNN    80.0       128.0   
8      NaN    4.0  3000m    2Gi  FashionMNISTCNN    80.0       128.0   
9      NaN    4.0  3000m    2Gi  FashionMNISTCNN    80.0       128.0   
10     NaN    1.0  3000m    2Gi  FashionMNISTCNN    80.0        32.0   
11     NaN    1.0  3000m    2Gi  FashionMNISTCNN    80.0        32.0   
12     NaN    1.0  3000m    2Gi  FashionMNISTCNN    80.0        32.0   
13     NaN    2.0  3000m    2Gi  FashionMNISTCNN    80.0        

# Regression Model for Batch experiments

**ANOVA Analysis for Regression Model**

In [4]:
#link for Ordinary Least Square (OLS) Method for Linear Regression by hand: https://medium.com/analytics-vidhya/ordinary-least-square-ols-method-for-linear-regression-ef8ca10aadfc

# Create regression model
model1 = ols("""Time_scaled  ~ Nodes + Epochs + C(Batch_size) + 
               Nodes:Epochs + Nodes:C(Batch_size) + Epochs:C(Batch_size) + 
               Nodes:Epochs:C(Batch_size)""", data=df_Batch).fit() #C() --> treat as categorical variable

# Anova analysis of model               
anova_results = sm.stats.anova_lm(model1,tpy=3) #typ 2 assumes no significant interactions and is morepowerful than type 3 if this is true
# note: if the data is balanced (equal sample size for each group), Type 1, 2, and 3 sums of squares will produce similar results.

In [5]:
#dir(model1._results) # list of attributes of model results
dir(model1)
#model1.get_influence().summary_frame()

['HC0_se',
 'HC1_se',
 'HC2_se',
 'HC3_se',
 '_HCCM',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abat_diagonal',
 '_cache',
 '_data_attr',
 '_data_in_cache',
 '_get_robustcov_results',
 '_is_nested',
 '_use_t',
 '_wexog_singular_values',
 'aic',
 'bic',
 'bse',
 'centered_tss',
 'compare_f_test',
 'compare_lm_test',
 'compare_lr_test',
 'condition_number',
 'conf_int',
 'conf_int_el',
 'cov_HC0',
 'cov_HC1',
 'cov_HC2',
 'cov_HC3',
 'cov_kwds',
 'cov_params',
 'cov_type',
 'df_model',
 'df_resid',
 'eigenvals',
 'el_test',
 'ess',
 'f_pvalue',
 'f_test',
 'fittedvalues',
 'fvalue',
 'get_influence',
 'get_prediction',
 'get_robustcov_results',
 'info_criteria',


In [6]:
print("\n\n * Anova results: ")
print(anova_results)  



 * Anova results: 
                              df       sum_sq      mean_sq          F  \
C(Batch_size)                1.0   891.994915   891.994915  11.299584   
Nodes                        1.0  1315.557436  1315.557436  16.665175   
Nodes:C(Batch_size)          1.0     3.012153     3.012153   0.038157   
Epochs                       1.0  2684.843590  2684.843590  34.010973   
Epochs:C(Batch_size)         1.0    75.708299    75.708299   0.959055   
Nodes:Epochs                 1.0   301.777143   301.777143   3.822843   
Nodes:Epochs:C(Batch_size)   1.0    59.226887    59.226887   0.750272   
Residual                    28.0  2210.334294    78.940511        NaN   

                              PR(>F)  
C(Batch_size)               0.002255  
Nodes                       0.000337  
Nodes:C(Batch_size)         0.846538  
Epochs                      0.000003  
Epochs:C(Batch_size)        0.335811  
Nodes:Epochs                0.060607  
Nodes:Epochs:C(Batch_size)  0.393752  
Residual 

In [7]:
print("* Regression Model Summary: ") #link on how to interpret: https://www.geeksforgeeks.org/interpreting-the-results-of-linear-regression-using-ols-summary/
print(model1.summary()) 

print("\n\n * Regression Parameter Estimates: ") 
print(model1._results.params)

* Regression Model Summary: 
                            OLS Regression Results                            
Dep. Variable:            Time_scaled   R-squared:                       0.707
Model:                            OLS   Adj. R-squared:                  0.634
Method:                 Least Squares   F-statistic:                     9.649
Date:                Wed, 05 Apr 2023   Prob (F-statistic):           4.59e-06
Time:                        09:51:31   Log-Likelihood:                -125.19
No. Observations:                  36   AIC:                             266.4
Df Residuals:                      28   BIC:                             279.1
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------

**Goodness of Fit**

In [8]:
## Factor Analysis
print("\nFACTOR ANALYSIS:")
# Determine statistical significance of factors (F-test factors)
print("\n___SIGNIFICANCE & IMPORTANCE________________________________")
f_significance = 0.05
factor_P_val = anova_results['PR(>F)'].values
df_factor = pd.DataFrame(data=factor_P_val, index = anova_results.index, columns=['P_val']) # if P val < significance -> stat. sig.
df_factor['significance(0.05)']= factor_P_val<f_significance
SST =  sum(anova_results['sum_sq'].values)
df_factor['% variation']= np.round(100*anova_results['sum_sq'].values/SST)

print(df_factor.sort_values(by='% variation', ascending=False))

# Determine a definite difference in performance of level of factors (T-test factors)
# -- NB: estimate factor +- (t_value)(std of estimate factor)
# -- NB: estimate factor level = avg y for factor level - y tot avg

# -- Determine avg y and std of y for factor levels
# ---- variable declaration
nodes_levels = df_Batch.drop_duplicates(inplace=False, subset=['Nodes'])['Nodes']._values #[1,2,4]
epochs_levels = df_Batch.drop_duplicates(inplace=False, subset=['Epochs'])['Epochs']._values #[40,80]
batchsize_levels = df_Batch.drop_duplicates(inplace=False, subset=['Batch_size'])['Batch_size']._values #[32,128]
factor_se2 = anova_results['sum_sq'].values[-1] / anova_results['df'].values[-1]
factor_df_y = len(nodes_levels)*len(epochs_levels)*len(batchsize_levels)*3 # df_y = a*b*c*replications

# ---- calculations
y_avg_nodes = []
y_std_nodes = []
for i in nodes_levels:
  y_avg_nodes += [df_Batch[df_Batch['Nodes']==i]['Time_scaled'].mean()] #avg y for nodes = i
  y_std_nodes += [(factor_se2*(len(nodes_levels)-1))/factor_df_y]

y_avg_epochs = []
y_std_epochs = []
for i in epochs_levels:
  y_avg_epochs += [df_Batch[df_Batch['Epochs']==i]['Time_scaled'].mean()] #avg y for epochs = i
  y_std_epochs += [(factor_se2*(len(epochs_levels)-1))/factor_df_y]

y_avg_batchsize = []
y_std_batchsize = []
for i in batchsize_levels:
  y_avg_batchsize += [df_Batch[df_Batch['Batch_size']==i]['Time_scaled'].mean()] #avg y for epochs = i
  y_std_batchsize += [(factor_se2*(len(batchsize_levels)-1))/factor_df_y]

# -- Determine estimates for factor levels
y_est_nodes = y_avg_nodes - df_Batch['Time_scaled'].mean()
y_est_epochs = y_avg_epochs - df_Batch['Time_scaled'].mean()
y_est_batchsize = y_avg_batchsize - df_Batch['Time_scaled'].mean()

# -- Determine confidence interval 
t_significance = 0.10 # 90% confidence
df_error = anova_results['df'].values[-1]
t_value = scipy.stats.t.ppf(q=1-(t_significance/2),df=df_error) #two tailed 

CI_est_nodes_l = y_est_nodes - np.multiply(y_std_nodes,t_value)
CI_est_nodes_u = y_est_nodes + np.multiply(y_std_nodes,t_value)
print("\n___NODE CONFIDENCE INTERVAL________________________________")
print("Nodes levels order: ", nodes_levels)
print("Nodes CI lower bound: ", CI_est_nodes_l)
print("Nodes CI upper bound: ", CI_est_nodes_u)
print("Definite difference at "+str(100*(1-t_significance))+"% CI level: ",(CI_est_nodes_l*CI_est_nodes_u)>0) #if intervals contain 0 result negative

CI_est_epochs_l = y_est_epochs - np.multiply(y_std_epochs,t_value)
CI_est_epochs_u = y_est_epochs + np.multiply(y_std_epochs,t_value)
print("\n___EPOCHS CONFIDENCE INTERVAL______________________________")
print("Epochs levels order: ", epochs_levels)
print("Epochs CI lower bound: ", CI_est_epochs_l)
print("Epochs CI upper bound: ", CI_est_epochs_u)
print("Definite difference at "+str(100*(1-t_significance))+"% CI level: ",(CI_est_epochs_l*CI_est_epochs_u)>0) #if intervals contain 0 result negative

CI_est_batchsize_l = y_est_batchsize - np.multiply(y_std_batchsize,t_value)
CI_est_batchsize_u = y_est_batchsize + np.multiply(y_std_batchsize,t_value)
print("\n___BATCH SIZE CONFIDENCE INTERVAL__________________________")
print("Batch size levels order: ", batchsize_levels)
print("Batch size CI lower bound: ", CI_est_batchsize_l)
print("Batch size CI upper bound: ", CI_est_batchsize_u)
print("Definite difference at "+str(100*(1-t_significance))+"% CI level: ",(CI_est_batchsize_l*CI_est_batchsize_u)>0) #if intervals contain 0 result negative



FACTOR ANALYSIS:

___SIGNIFICANCE & IMPORTANCE________________________________
                               P_val  significance(0.05)  % variation
Epochs                      0.000003                True         36.0
Residual                         NaN               False         29.0
Nodes                       0.000337                True         17.0
C(Batch_size)               0.002255                True         12.0
Nodes:Epochs                0.060607               False          4.0
Epochs:C(Batch_size)        0.335811               False          1.0
Nodes:Epochs:C(Batch_size)  0.393752               False          1.0
Nodes:C(Batch_size)         0.846538               False          0.0

___NODE CONFIDENCE INTERVAL________________________________
Nodes levels order:  [1. 2. 4.]
Nodes CI lower bound:  [  2.27725842  -7.63708255 -14.77436922]
Nodes CI upper bound:  [17.19816335  7.28382239  0.14653571]
Definite difference at 90.0% CI level:  [ True False False]

___EPOCHS C

In [9]:
## Model Analysis 
print("\nMODEL ANALYSIS:")

# Determine statistical significance of regression model (F-test model)
# --MSR/MSE  (Null hypothesis is that y doesnt depend on any factor x)
f_significance = 0.05
print("\n___STATISTICAL SIGNIFICANCE________________________________")
print("Given significance of "+str(f_significance)+" the statistically significance of the model evaluates to: ", model1.f_pvalue<f_significance)# if P val < significance -> stat. sig.

# Coefficient of Determination
print("\n___COEFF OF DETERMINATION__________________________________")
print("Coefficient of Determination (R^2) = ",np.round(model1._results.rsquared*100),"%") # % of variation of y explained by regression (importance)

# Confidence interval of regression parameters (T-test factors)
# --estimate b +- (t_value)(std of estimate b)
t_significance = 0.10 # 90% confidence
df_error_model = model1.df_resid
t_value_model = scipy.stats.t.ppf(q=1-(t_significance/2),df=df_error_model) #two tailed 
model_se2 = model1.mse_resid
model_df_y = model1._results.nobs
model_std_param = (model_se2 * 1)/model_df_y #df of regression parameter is 1 

CI_b = pd.DataFrame(data=model1.params - (t_value_model * model_std_param), columns=['lower_bound'])
CI_b['upper_bound']= model1.params + (t_value_model * model_std_param )
CI_b["t_test_pass"] = (CI_b['lower_bound']*CI_b['upper_bound'])>0 #if intervals contain 0 result negative

print("\n___REGRESSION PARAM (b) CONFIDENCE INTERVAL________________")
print(CI_b)

# Confidence interval of predicted response
# --estimate y +- (t_value)(std of estimate y)
print("\n___PREDICTED RESPONSE CONFIDENCE INTERVAL__________________")
#print("Predicted value =", model1.predict()) 
#print(model1.get_prediction().summary_frame().mean_se)  #to get the std err (se) of predicted values




MODEL ANALYSIS:

___STATISTICAL SIGNIFICANCE________________________________
Given significance of 0.05 the statistically significance of the model evaluates to:  True

___COEFF OF DETERMINATION__________________________________
Coefficient of Determination (R^2) =  71.0 %

___REGRESSION PARAM (b) CONFIDENCE INTERVAL________________
                                     lower_bound  upper_bound  t_test_pass
Intercept                              -2.047398     5.413055        False
C(Batch_size)[T.128.0]                -17.870568   -10.410116         True
Nodes                                  -4.744230     2.716223        False
Nodes:C(Batch_size)[T.128.0]            1.645983     9.106436         True
Epochs                                 -3.062250     4.398202        False
Epochs:C(Batch_size)[T.128.0]          -3.587821     3.872632        False
Nodes:Epochs                           -3.799063     3.661390        False
Nodes:Epochs:C(Batch_size)[T.128.0]    -3.835751     3.624701   

In [10]:
#Based on regression example at end of slides 2
'''
#Estimating model parameters for [y] = [X][b] + [e]


TO DO: Calculate averages over replications

NB: Model doesnt calculate for interactions between factors

y = df_Batch[['Time_scaled']].to_numpy()
X = df_Batch[['b0','Nodes','Epochs','Batch_size']].to_numpy() 

C= np.dot(X.transpose(),X) #(X^T X)
#if determinant not 0
if np.linalg.det(C) != 0:
  C=np.lingalg.inv(C) 
else: 
  C = C#*0 #set to 0 to indicate invalid C val

D =  np.dot(X.transpose(),y)
b = np.dot(C, D)
'''

"\n#Estimating model parameters for [y] = [X][b] + [e]\n\n\nTO DO: Calculate averages over replications\n\nNB: Model doesnt calculate for interactions between factors\n\ny = df_Batch[['Time_scaled']].to_numpy()\nX = df_Batch[['b0','Nodes','Epochs','Batch_size']].to_numpy() \n\nC= np.dot(X.transpose(),X) #(X^T X)\n#if determinant not 0\nif np.linalg.det(C) != 0:\n  C=np.lingalg.inv(C) \nelse: \n  C = C#*0 #set to 0 to indicate invalid C val\n\nD =  np.dot(X.transpose(),y)\nb = np.dot(C, D)\n"