In [25]:
import math
import statistics as st

import numpy as np
import pandas as pd
import scikit_posthocs as sp
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Statistical analysis of TOC removal

First we import all relevant libraries. Then we import our TOC removal data (`TOC_removal.xlsx`); the resulting dataframe is displayed below.

Since we have multiple groups with one independent variable (temperature) and want to compare them, we perform a one-way ANOVA. Before doing so, we need to check whether the assumptions of the one-way ANOVA are violated. The assumptions are normality and equal variance; they are checked with the Shapiro–Wilk test and Levene's test, respectively.

In [26]:
# Read the TOC removal measurements from the Excel workbook into a dataframe.
toc_workbook = 'TOC_removal.xlsx'
df1 = pd.read_excel(toc_workbook)

# Show the full table so the raw values can be inspected before the analysis.
display(df1)

Unnamed: 0,SBWW_10,SBWW_15,SBWW_20,SBWW_25
0,1167.2,1374.3,1253.6,1300.6
1,1269.4,1370.94,1254.4,1276.4
2,1270.6,1397.02,1221.2,1328.2


In [27]:
# Pull each group's column out of the dataframe as a plain Python list.
SBWW_10, SBWW_15, SBWW_20, SBWW_25 = (
    df1[column].values.tolist()
    for column in ('SBWW_10', 'SBWW_15', 'SBWW_20', 'SBWW_25')
)

# All four groups together, in the same order as the columns above.
data = [SBWW_10, SBWW_15, SBWW_20, SBWW_25]

In [28]:
# Levene's test for equal variances across the four groups, centred on the median.
# `data` holds the same four lists (SBWW_10, SBWW_15, SBWW_20, SBWW_25) in order.
# Note: per the SciPy docs `proportiontocut` only applies when center='trimmed',
# so it has no effect on this call.
stats.levene(
    *data,
    center='median',
    proportiontocut=0.05,
)

LeveneResult(statistic=0.390721736461269, pvalue=0.7630699924093591)

In [29]:
# Shapiro-Wilk normality test for each group, in the order
# SBWW_10, SBWW_15, SBWW_20, SBWW_25; all results are printed on one line.
shapiro_results = [stats.shapiro(group) for group in data]
print(*shapiro_results)

ShapiroResult(statistic=0.75870281457901, pvalue=0.01930529810488224) ShapiroResult(statistic=0.844821572303772, pvalue=0.2266968935728073) ShapiroResult(statistic=0.7680623531341553, pvalue=0.04033590108156204) ShapiroResult(statistic=0.9985658526420593, pvalue=0.9276542663574219)


# Normality and equal variance

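Levene's test shows no significant difference between the group variances, so there is no evidence against the equal-variance assumption, and we proceed with a one-way ANOVA test. Since only 2 out of 4 groups violate the normality requirement (Shapiro–Wilk p < 0.05), we still move forward with the one-way ANOVA, but the result should be interpreted with caution given the small sample size (three observations per group).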

In [30]:
# One-way ANOVA across the four groups collected in `data`
# (SBWW_10, SBWW_15, SBWW_20, SBWW_25).
stats.f_oneway(*data)

F_onewayResult(statistic=11.374649420621454, pvalue=0.0029477328670775007)

# Since p < 0.05, we have evidence for a significant difference in TOC removal between the different temperatures

After doing our one-way ANOVA we can see that, with a p-value of approx. 0.0029, there is evidence for rejecting $H_0$, i.e. the hypothesis that the TOC removal at temperatures 10, 15, 20, and 25 is not significantly different between the groups. We therefore proceed with a post hoc analysis.

In [32]:
# Post hoc pairwise comparison of the four groups with Tukey's HSD
# (family-wise error rate 0.05). `pairwise_tukeyhsd` is imported in the
# notebook's top import cell.
#
# NOTE(review): the scores below are hard-coded and match the df1 values
# divided by 1561 (e.g. 1167.2 / 1561 is about 0.747726) -- confirm where the
# 1561 comes from and consider deriving the scores from df1 so the two
# cannot drift apart.
df2 = pd.DataFrame({
    'score': [
        0.747726, 0.813197, 0.813965,  # SBWW_10
        0.880397, 0.878245, 0.894952,  # SBWW_15
        0.803075, 0.803587, 0.782319,  # SBWW_20
        0.833184, 0.817681, 0.850865,  # SBWW_25
    ],
    # Three consecutive scores per group, in the same order as the rows above.
    'group': np.repeat(['SBWW_10', 'SBWW_15', 'SBWW_20', 'SBWW_25'], repeats=3),
})

tukey = pairwise_tukeyhsd(
    endog=df2['score'],
    groups=df2['group'],
    alpha=0.05,
)

print(tukey)

 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
 group1  group2 meandiff p-adj   lower   upper  reject
------------------------------------------------------
SBWW_10 SBWW_15   0.0929 0.0039  0.0351  0.1507   True
SBWW_10 SBWW_20   0.0047    0.9 -0.0531  0.0625  False
SBWW_10 SBWW_25   0.0423 0.1668 -0.0155     0.1  False
SBWW_15 SBWW_20  -0.0882 0.0053  -0.146 -0.0304   True
SBWW_15 SBWW_25  -0.0506 0.0874 -0.1084  0.0071  False
SBWW_20 SBWW_25   0.0376 0.2367 -0.0202  0.0953  False
------------------------------------------------------
