In [77]:
import pandas as pd
import seaborn as sns
sns.set(context="notebook", palette="Spectral", style = 'darkgrid' ,font_scale = 1.5, color_codes=True)
import warnings
warnings.filterwarnings('ignore')
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from scipy.special import ndtri
from scipy.stats import chi2
from scipy.stats import t
from scipy.stats import f
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.regression.linear_model import OLS
import random
from scipy.stats import shapiro
from scipy.stats import bartlett


### Reduced Model

In [2]:
df_reduced = pd.read_csv(r'../data/direct_transitivity/d_.csv')
df_reduced.head()

Unnamed: 0,Index,File,Nodes,Total Nodes,Butterflies,X-vars,C-vars,Total vars,Total constraints,Crossings,Opttime,Status,Nodes visited,Setup Time
0,0,north/g.10.72.graphml,10,12,0,8,7,15,18,0,0.001962,2,0,0.028
1,1,north/g.13.45.graphml,13,13,0,31,2,33,118,0,7.3e-05,2,0,0.004
2,2,north/g.10.11.graphml,10,10,0,15,0,15,40,0,6.7e-05,2,0,0.002
3,3,Rome-Lib/graficon11nodi/grafo233.11,11,18,0,15,12,27,36,0,5.5e-05,2,0,0.002
4,4,Rome-Lib/graficon12nodi/grafo2240.12,12,16,0,12,11,23,28,0,0.011561,2,1,0.001


In [3]:
cols_of_interest = ['Total vars','Total constraints','Total Nodes','Crossings','Opttime']
df_reduced_subset = df_reduced[cols_of_interest]
df_reduced_subset.rename(columns={'Total vars': 'Total_vars','Total constraints': 'Total_constrains','Total Nodes':'Total_nodes'}, inplace=True)
df_reduced_subset.head()


Unnamed: 0,Total_vars,Total_constrains,Total_nodes,Crossings,Opttime
0,15,18,12,0,0.001962
1,33,118,13,0,7.3e-05
2,15,40,10,0,6.7e-05
3,27,36,18,0,5.5e-05
4,23,28,16,0,0.011561


In [4]:
reduced_model = ols(formula=" Opttime ~ Total_vars + Total_constrains + Total_nodes + Crossings", data=df_reduced_subset).fit()

print(reduced_model.summary())

                            OLS Regression Results                            
Dep. Variable:                Opttime   R-squared:                       0.762
Model:                            OLS   Adj. R-squared:                  0.761
Method:                 Least Squares   F-statistic:                     917.4
Date:                Thu, 07 Dec 2023   Prob (F-statistic):               0.00
Time:                        13:29:17   Log-Likelihood:                -4580.9
No. Observations:                1150   AIC:                             9172.
Df Residuals:                    1145   BIC:                             9197.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.9571      1.023  

In [5]:
sse_r = reduced_model.ssr
df_r = reduced_model.df_resid
print(f"SSE(R) = {sse_r},\nDegrees of freedom = {df_r}")

SSE(R) = 194145.65564334817,
Degrees of freedom = 1145.0


### Full Model

In [6]:
df_full = pd.read_csv(r'../data/direct_transitivity_data.csv')
df_full.head()

Unnamed: 0,Index,File,Nodes,Total Nodes,Butterflies,X-vars,C-vars,Total vars,Total constraints,Crossings,...,config,check_1,check_2,check_3,check_4,check_5,check_6,check_7,check_8,check_9
0,0,north/g.10.72.graphml,10,12,0,8,7,15,20,0,...,d_235689,0,1,1,0,1,1,0,1,1
1,1,north/g.13.45.graphml,13,13,0,6,2,8,8,0,...,d_235689,0,1,1,0,1,1,0,1,1
2,2,north/g.10.11.graphml,10,10,0,15,0,15,40,0,...,d_235689,0,1,1,0,1,1,0,1,1
3,3,Rome-Lib/graficon11nodi/grafo233.11,11,18,0,15,12,27,36,0,...,d_235689,0,1,1,0,1,1,0,1,1
4,4,Rome-Lib/graficon12nodi/grafo2240.12,12,16,0,12,11,23,30,0,...,d_235689,0,1,1,0,1,1,0,1,1


In [7]:
cols_of_interest = ['File','Total vars','Total constraints','Total Nodes','Crossings', 'check_1', 'check_2', 'check_3', 'check_4',
 'check_5', 'check_6','check_7', 'check_8', 'check_9','Opttime']
df_full_subset = df_full[cols_of_interest]
df_full_subset.rename(columns={'Total vars': 'Total_vars','Total constraints': 'Total_constrains','Total Nodes':'Total_nodes'}, inplace=True)
df_full_subset.head()

Unnamed: 0,File,Total_vars,Total_constrains,Total_nodes,Crossings,check_1,check_2,check_3,check_4,check_5,check_6,check_7,check_8,check_9,Opttime
0,north/g.10.72.graphml,15,20,12,0,0,1,1,0,1,1,0,1,1,0.000556
1,north/g.13.45.graphml,8,8,13,0,0,1,1,0,1,1,0,1,1,0.000291
2,north/g.10.11.graphml,15,40,10,0,0,1,1,0,1,1,0,1,1,4e-05
3,Rome-Lib/graficon11nodi/grafo233.11,27,36,18,0,0,1,1,0,1,1,0,1,1,0.002283
4,Rome-Lib/graficon12nodi/grafo2240.12,23,30,16,0,0,1,1,0,1,1,0,1,1,0.000364


In [8]:
df_full_subset.groupby(['File','Total_nodes'])['check_1'].count()

File                                     Total_nodes
Rome-Lib/graficon100nodi/grafo10372.100  190            512
Rome-Lib/graficon100nodi/grafo10550.100  218            512
Rome-Lib/graficon100nodi/grafo10937.100  203            512
Rome-Lib/graficon100nodi/grafo11613.100  231            512
Rome-Lib/graficon10nodi/grafo1010.10     12             512
                                                       ... 
north/g.69.3.graphml                     69             512
north/g.69.5.graphml                     69             512
north/g.70.1.graphml                     173            512
north/g.73.8.graphml                     167            512
north/g.75.7.graphml                     110            512
Name: check_1, Length: 1150, dtype: int64

In [9]:
df_full_subset_mean = pd.DataFrame(df_full_subset.groupby(['File','Total_nodes']).mean().reset_index())
df_full_subset_mean.head()

Unnamed: 0,File,Total_nodes,Total_vars,Total_constrains,Crossings,check_1,check_2,check_3,check_4,check_5,check_6,check_7,check_8,check_9,Opttime
0,Rome-Lib/graficon100nodi/grafo10372.100,190,4684.5,21258.5,50.519531,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,42.888354
1,Rome-Lib/graficon100nodi/grafo10550.100,218,4740.75,20871.75,39.484375,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,38.828083
2,Rome-Lib/graficon100nodi/grafo10937.100,203,8550.0,54953.0,520.349609,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,60.147798
3,Rome-Lib/graficon100nodi/grafo11613.100,231,5979.0,31154.75,65.232422,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,42.766048
4,Rome-Lib/graficon10nodi/grafo1010.10,12,46.5,87.5,0.0,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.009616


In [10]:
' + '.join(list(df_full_subset_mean.columns))

'File + Total_nodes + Total_vars + Total_constrains + Crossings + check_1 + check_2 + check_3 + check_4 + check_5 + check_6 + check_7 + check_8 + check_9 + Opttime'

In [11]:
full_model = ols(formula="Opttime ~ Total_nodes + Total_vars + Total_constrains + Crossings + check_1 + check_2 + check_3 + check_4 + check_5 + check_6 + check_7 + check_8 + check_9", data=df_full_subset_mean).fit()

print(full_model.summary())

                            OLS Regression Results                            
Dep. Variable:                Opttime   R-squared:                       0.828
Model:                            OLS   Adj. R-squared:                  0.827
Method:                 Least Squares   F-statistic:                     1377.
Date:                Thu, 07 Dec 2023   Prob (F-statistic):               0.00
Time:                        13:29:18   Log-Likelihood:                -4285.8
No. Observations:                1150   AIC:                             8582.
Df Residuals:                    1145   BIC:                             8607.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.3561      0.247  

# Why are degrees of freedom only 4 when parameters are 15


In [12]:
sse_f = full_model.ssr
df_f = full_model.df_resid
print(f"SSE(F) = {sse_f},\nDegrees of freedom = {df_f}")

SSE(F) = 116216.34333350047,
Degrees of freedom = 1145.0


### F-test

In [13]:
alpha = 0.05
f_star = ((sse_r - sse_f)/(df_r - df_f))/(sse_f/df_f)
f_star

critical_value = f.ppf(1-alpha, df_r - df_f, df_f)

if (f_star <= critical_value):
    print(f'Conclude Null with f_star={f_star} and crtical value={critical_value}')
else:
    print(f'Conclude Alternate with f_star={f_star} and crtical value={critical_value}')

Conclude Alternate with f_star=inf and crtical value=nan


In [14]:
df_full_subset_mean.columns

Index(['File', 'Total_nodes', 'Total_vars', 'Total_constrains', 'Crossings',
       'check_1', 'check_2', 'check_3', 'check_4', 'check_5', 'check_6',
       'check_7', 'check_8', 'check_9', 'Opttime'],
      dtype='object')

# Trying Something NEW

In [15]:
graph_df = pd.read_csv(r'../data/graph_properties.csv')
graph_df['graph_id'] = graph_df.index + 1
graph_df.head()

Unnamed: 0,Index,File,Nodes,NumLayers,NumEdges,EdgeDensity,AvgNodesPerLayer,AvgEdgesPerLayer,graph_id
0,0,north/g.10.72.graphml,12,6,13,0.52,2.0,2.6,1
1,1,north/g.13.45.graphml,13,4,12,0.342857,3.25,4.0,2
2,2,north/g.10.11.graphml,10,5,14,1.0,2.0,3.5,3
3,3,Rome-Lib/graficon11nodi/grafo233.11,18,8,19,0.475,2.25,2.714286,4
4,4,Rome-Lib/graficon12nodi/grafo2240.12,16,7,17,0.485714,2.285714,2.833333,5


In [16]:
joined_df = df_full.merge(graph_df, on = 'File')
joined_df = joined_df[['graph_id','Total Nodes', 'Crossings','Opttime','Status','config', 'check_1', 'check_2',
       'check_3', 'check_4', 'check_5', 'check_6', 'check_7', 'check_8',
       'check_9' ]]

joined_df.head()

Unnamed: 0,graph_id,Total Nodes,Crossings,Opttime,Status,config,check_1,check_2,check_3,check_4,check_5,check_6,check_7,check_8,check_9
0,1,12,0,0.000556,2,d_235689,0,1,1,0,1,1,0,1,1
1,1,12,0,0.000984,2,d_158,1,0,0,0,1,0,0,1,0
2,1,12,0,0.000857,2,d_1567,1,0,0,0,1,1,1,0,0
3,1,12,0,0.000621,2,d_234568,0,1,1,1,1,1,0,1,0
4,1,12,0,0.000661,2,d_58,0,0,0,0,1,0,0,1,0


In [17]:
joined_df['config'].replace('d_','d_0', inplace=True)
joined_df[joined_df['config'] == 'd_0']

Unnamed: 0,graph_id,Total Nodes,Crossings,Opttime,Status,config,check_1,check_2,check_3,check_4,check_5,check_6,check_7,check_8,check_9
409,1,12,0,0.001962,2,d_0,0,0,0,0,0,0,0,0,0
921,2,13,0,0.000073,2,d_0,0,0,0,0,0,0,0,0,0
1433,3,10,0,0.000067,2,d_0,0,0,0,0,0,0,0,0,0
1945,4,18,0,0.000055,2,d_0,0,0,0,0,0,0,0,0,0
2457,5,16,0,0.011561,2,d_0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586649,1146,232,427,60.059385,9,d_0,0,0,0,0,0,0,0,0,0
587161,1147,231,660,60.106764,9,d_0,0,0,0,0,0,0,0,0,0
587673,1148,230,258,60.041819,9,d_0,0,0,0,0,0,0,0,0,0
588185,1149,231,196,60.041623,9,d_0,0,0,0,0,0,0,0,0,0


In [18]:
joined_df['combination']=joined_df['config'].str.split('_').str[1]

In [19]:
joined_df['combination'] = joined_df['combination'].astype('category')

joined_df['graph_id'].dtype

dtype('int64')

In [62]:
cols_of_interest = ['graph_id','Total Nodes','Crossings', 'combination','Status','Opttime']
main_df = joined_df[cols_of_interest]

main_df

Unnamed: 0,graph_id,Total Nodes,Crossings,combination,Status,Opttime
0,1,12,0,235689,2,0.000556
1,1,12,0,158,2,0.000984
2,1,12,0,1567,2,0.000857
3,1,12,0,234568,2,0.000621
4,1,12,0,58,2,0.000661
...,...,...,...,...,...,...
588795,1150,230,26,124679,2,14.263508
588796,1150,230,26,14589,2,25.730838
588797,1150,230,27,69,9,60.585124
588798,1150,230,26,169,2,14.713520


## Random Sampling for 100 Graphs 

### Single Factor Study using combination switches as a factor


In [22]:
np.random.seed(42)
random_graph = np.random.randint(1, 1150, 100)
random_graph

main_df_sample = main_df[main_df['graph_id'].isin(random_graph)]


In [23]:
main_df_sample = main_df_sample[['combination','Opttime']]
main_df_sample.sort_values(by='combination', inplace=True)
main_df_sample.head()

Unnamed: 0,combination,Opttime
425881,0,56.879137
548761,0,60.070095
331161,0,0.644449
17817,0,0.016357
205721,0,1.653053


In [24]:
main_df_sample['combination'].value_counts()

0        98
1        98
2479     98
24789    98
2478     98
         ..
13567    98
1356     98
135      98
1349     98
9        98
Name: combination, Length: 512, dtype: int64

In [25]:
model = ols('Opttime ~ C(combination)', data=main_df_sample).fit()

aov_table = sm.stats.anova_lm(model, typ=2)

In [26]:
print(f"aov_table:\n {aov_table}")

aov_table:
                       sum_sq       df         F         PR(>F)
C(combination)  1.062337e+06    511.0  3.022024  5.344504e-102
Residual        3.416529e+07  49664.0       NaN            NaN


In [27]:
f_star = aov_table['F'][0]
df_sstr = aov_table['df'][0]

df_sse = aov_table['df'][1]

print(f'F-statistic is {f_star}')



critical_value = critical_value = f.ppf(1-alpha,df_sstr , df_sse)

print(f"Critical value is {critical_value}")

if (f_star <= critical_value):
    print(f"We conclude null hypothesis, since f_star: {f_star} is less than equal to critical value: {critical_value}. Which means that Opttime is same for all combinations")
else:
    print(f"We conclude alternate hypothesis, since f_star: {f_star} is greater than critical value: {critical_value}. Which means that Opttime is not same in all combinations")    


F-statistic is 3.022023592575528
Critical value is 1.1056988352196992
We conclude alternate hypothesis, since f_star: 3.022023592575528 is greater than critical value: 1.1056988352196992. Which means that Opttime is not same in all combinations


## Random Sampling for 100 Graphs 

### Two Factor Study using combination switches and graph_id as a factor


In [38]:
np.random.seed(42)
random_graph = np.random.randint(1, 1150, 100)
random_graph

main_df_sample = main_df[main_df['graph_id'].isin(random_graph)]


In [41]:
main_df_sample[['combination']].value_counts()

combination
0              98
1              98
2479           98
24789          98
2478           98
               ..
13567          98
1356           98
135            98
1349           98
9              98
Length: 512, dtype: int64

In [42]:
main_df_sample = main_df_sample[['combination','graph_id','Opttime']]
main_df_sample['graph_id'] = main_df_sample['graph_id'].astype('category')
main_df_sample.sort_values(by=['combination', 'graph_id'], inplace=True)
main_df_sample.shape

(50176, 3)

In [45]:
# Performing two-way ANOVA 
model_2 = ols('Opttime ~ C(combination) + C(graph_id)', data=main_df_sample).fit() 


In [53]:

aov_table_2 = sm.stats.anova_lm(model_2, typ=2)
print(f"aov_table:\n {aov_table_2}")

aov_table:
                       sum_sq       df            F  PR(>F)
C(combination)  1.062337e+06    511.0    29.110671     0.0
C(graph_id)     3.062547e+07     97.0  4421.014267     0.0
Residual        3.539824e+06  49567.0          NaN     NaN
