<a href="https://colab.research.google.com/github/stephaniePocci/ANOVA-Post-Hoc-Comparison/blob/main/Two_Way_ANOVA_%26_Post_Hoc_Comparison_Tests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Two-Way ANOVA Test

In [None]:
import statsmodels.formula.api as sm
import statsmodels.api as sm_api

# Example data (replace with your actual data)
# Make sure your data is in a Pandas DataFrame
# and the columns are named appropriately.
import pandas as pd
data = {
    # Ten observations per shopping method
    'Method': ['Amazon'] * 10 + ['Target'] * 10,
    'AgeGroup': [
        'Younger', 'Younger', 'Older', 'Younger', 'Older',
        'Younger', 'Older', 'Younger', 'Older', 'Younger',
        'Younger', 'Younger', 'Older', 'Younger', 'Older',
        'Younger', 'Older', 'Younger', 'Older', 'Younger',
    ],
    'Score': [
        7, 11, 20, 12, 10, 20, 22, 12, 13, 12,
        7, 12, 28, 15, 24, 35, 31, 21, 18, 20,
    ],
}
df = pd.DataFrame(data)

# Two-way ANOVA: fit an OLS model with both main effects plus their
# interaction, then summarize it as a Type II ANOVA table.
model = sm.ols(
    'Score ~ C(Method) + C(AgeGroup) + C(Method):C(AgeGroup)',
    data=df,
).fit()
anova_table = sm_api.stats.anova_lm(model, typ=2)

# Mean square: each row's sum of squares divided by its degrees of freedom
anova_table['mean_sq'] = anova_table['sum_sq'].div(anova_table['df'])

# Column legend for the printed table:
#   sum_sq  - sum of squares: variation in Score attributed to that row
#   df      - degrees of freedom: independent pieces of information behind the estimate
#   F       - F-value: the statistic used to test the null hypothesis of equal group means
#   PR(>F)  - p-value: probability of an F at least this large if the null hypothesis
#             holds (below 0.05 we reject the null hypothesis)
#   mean_sq - mean square: sum_sq / df
#   lambda  - effect size; this column is added to the table in a later cell
print(anova_table)


                           sum_sq    df         F    PR(>F)     mean_sq
C(Method)              259.200000   1.0  5.543150  0.031660  259.200000
C(AgeGroup)            140.833333   1.0  3.011807  0.101876  140.833333
C(Method):C(AgeGroup)   10.800000   1.0  0.230965  0.637315   10.800000
Residual               748.166667  16.0       NaN       NaN   46.760417


In [None]:
# Calculate lambda for each factor
def calculate_lambda(anova_table, factor):
    """Return one row's share of the table's summed sum of squares.

    The value is ``sum_sq[factor] / sum(sum_sq)``, where the denominator
    adds up the ``sum_sq`` entry of every row in the table. When the rows
    partition the total sum of squares, this ratio is the effect size
    usually reported as eta-squared.

    Parameters
    ----------
    anova_table : pandas.DataFrame
        Table with a ``sum_sq`` column, indexed by row label.
    factor : hashable
        Index label of the row to evaluate.

    Returns
    -------
    float
        Proportion of the summed ``sum_sq`` column held by ``factor``.

    Raises
    ------
    KeyError
        If ``factor`` is not an index label or ``sum_sq`` is not a column.
    """
    sum_of_squares = anova_table['sum_sq']
    return sum_of_squares.loc[factor] / sum_of_squares.sum()


# Evaluate the effect-size ratio once per table row (in index order) and
# store it next to the other ANOVA statistics.
anova_table['lambda'] = [
    calculate_lambda(anova_table, row_label) for row_label in anova_table.index
]
print(anova_table)

                           sum_sq    df         F    PR(>F)     mean_sq  \
C(Method)              259.200000   1.0  5.543150  0.031660  259.200000   
C(AgeGroup)            140.833333   1.0  3.011807  0.101876  140.833333   
C(Method):C(AgeGroup)   10.800000   1.0  0.230965  0.637315   10.800000   
Residual               748.166667  16.0       NaN       NaN   46.760417   

                         lambda  
C(Method)              0.223641  
C(AgeGroup)            0.121513  
C(Method):C(AgeGroup)  0.009318  
Residual               0.645528  


In [None]:
import pandas as pd
import scipy.stats as stats

# Your existing data loading code
data = {
    'Method': [
        'Amazon', 'Amazon', 'Amazon', 'Amazon', 'Amazon',
        'Amazon', 'Amazon', 'Amazon', 'Amazon', 'Amazon',
        'Target', 'Target', 'Target', 'Target', 'Target',
        'Target', 'Target', 'Target', 'Target', 'Target',
    ],
    'AgeGroup': [
        'Younger', 'Younger', 'Older', 'Younger', 'Older',
        'Younger', 'Older', 'Younger', 'Older', 'Younger',
        'Younger', 'Younger', 'Older', 'Younger', 'Older',
        'Younger', 'Older', 'Younger', 'Older', 'Younger',
    ],
    'Score': [
        7, 11, 20, 12, 10, 20, 22, 12, 13, 12,
        7, 12, 28, 15, 24, 35, 31, 21, 18, 20,
    ],
}
df = pd.DataFrame(data)

# Label every row with its Method/AgeGroup combination, e.g. 'Amazon_Younger'
df['Group'] = df['Method'] + '_' + df['AgeGroup']

# Kruskal-Wallis test across all Method x AgeGroup combinations:
# one sample of scores per distinct label, in order of first appearance.
groups = df['Group'].unique()
data_for_kruskal = [df.loc[df['Group'] == g, 'Score'] for g in groups]
h_statistic, p_value = stats.kruskal(*data_for_kruskal)

# The H-statistic measures how strongly the rank distributions differ between groups
print(f"H-statistic: {h_statistic}")
print(f"P-value: {p_value}")

# A p-value below 0.05 is conventionally treated as statistically significant
# (reject the null hypothesis that all groups share the same distribution);
# a p-value of 0.05 or above is not significant.

H-statistic: 6.882382762991127
P-value: 0.07574263435728143


In [None]:
import pandas as pd
import scipy.stats as stats

# Reuses the `df` DataFrame built in the previous cell

# Kruskal-Wallis test on Method: Amazon scores versus Target scores
h_statistic_method, p_value_method = stats.kruskal(
    *(df.loc[df['Method'] == level, 'Score'] for level in ('Amazon', 'Target'))
)
print(f"Method - H-statistic: {h_statistic_method}")
print(f"Method - P-value: {p_value_method}")

# Kruskal-Wallis test on AgeGroup: Younger scores versus Older scores
h_statistic_agegroup, p_value_agegroup = stats.kruskal(
    *(df.loc[df['AgeGroup'] == level, 'Score'] for level in ('Younger', 'Older'))
)
print(f"AgeGroup - H-statistic: {h_statistic_agegroup}")
print(f"AgeGroup - P-value: {p_value_agegroup}")

# A p-value below 0.05 is conventionally treated as statistically significant
# (reject the null hypothesis that the groups share the same distribution);
# a p-value of 0.05 or above is not significant.

Method - H-statistic: 3.9069201520912493
Method - P-value: 0.04808764647392965
AgeGroup - H-statistic: 2.913814955640049
AgeGroup - P-value: 0.0878239058947686
