# ANOVA Analysis: Plant Growth Dataset

## Research Question
**Does the type of treatment significantly change the weight of the plants?**

### Dataset Overview
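- **Source**: Classic R dataset (`PlantGrowth`, from R's `datasets` package), loaded in Python through statsmodels' `get_rdataset` helper (the data is downloaded on first use, so an internet connection is required)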
- **Description**: Results from an experiment comparing yields on plants under a control and two different treatment conditions
- **Groups**: Control, Treatment 1, Treatment 2
- **Sample Size**: 30 plants (10 per group)
- **Dependent Variable**: Plant weight (continuous)
- **Independent Variable**: Treatment group (categorical)

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import f_oneway, shapiro, levene
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols
import warnings
warnings.filterwarnings('ignore')

# Plot defaults: seaborn's grid theme plus a wide default figure canvas
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

## 1. Load and Explore the Data

In [None]:
# Fetch the PlantGrowth data through statsmodels' R-datasets helper
from statsmodels.datasets import get_rdataset

# Keep the returned dataset object around; its `.data` attribute is the
# working frame used by every later cell.
plant_growth = get_rdataset(dataname='PlantGrowth', package='datasets')
df = plant_growth.data

print("Dataset Shape:", df.shape)
print("\nFirst 10 rows:")
df.head(n=10)

In [None]:
# Structural overview: dtypes, missing values, and rows per treatment group
rule = "=" * 50

print("Dataset Information:")
print(rule)
df.info()

missing_per_column = df.isnull().sum()
print("\n" + rule)
print("Missing Values:")
print(missing_per_column)

rows_per_group = df['group'].value_counts()
print("\n" + rule)
print("Group Distribution:")
print(rows_per_group)

## 2. Descriptive Statistics

In [None]:
# Describe weight for the pooled sample first, then split by treatment group
overall_summary = df['weight'].describe()
group_stats = df.groupby('group')['weight'].describe()

print("Overall Weight Statistics:")
print("="*50)
print(overall_summary)

print("\n" + "="*50)
print("\nStatistics by Group:")
print("="*50)
# Bare expression so the notebook renders the per-group table as cell output
group_stats

In [None]:
# Extra per-group statistics, all derived from one shared groupby object
print("Additional Group Statistics:")
print("="*50)

weight_by_group = df.groupby('group')['weight']
group_min = weight_by_group.min()
group_max = weight_by_group.max()

summary_df = pd.DataFrame({
    'Mean': weight_by_group.mean(),
    'Median': weight_by_group.median(),
    'Std Dev': weight_by_group.std(),
    'Variance': weight_by_group.var(),
    'Min': group_min,
    'Max': group_max,
    'Range': group_max - group_min,
})

# Bare expression so the notebook renders the rounded table as cell output
summary_df.round(3)

## 3. Data Visualization

In [None]:
# Create comprehensive visualization: a 2x3 dashboard of weight by treatment group
fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# Per-group summaries are computed once. groupby sorts the group labels, and
# that sorted order is passed to every panel so the categories (and the mean
# markers drawn at integer positions) line up across plots.
weight_by_group = df.groupby('group')['weight']
means = weight_by_group.mean()
stds = weight_by_group.std()
group_order = list(means.index)
group_positions = range(len(group_order))

# 1. Box Plot
df.boxplot(column='weight', by='group', ax=axes[0, 0])
# With `by=`, pandas also writes a figure-level "Boxplot grouped by ..."
# suptitle, which would sit on top of the whole dashboard; blank it out.
fig.suptitle('')
axes[0, 0].set_title('Box Plot: Weight by Treatment Group')
axes[0, 0].set_xlabel('Treatment Group')
axes[0, 0].set_ylabel('Plant Weight')
plt.sca(axes[0, 0])
plt.xticks(rotation=0)

# 2. Violin Plot
sns.violinplot(data=df, x='group', y='weight', order=group_order, ax=axes[0, 1])
axes[0, 1].set_title('Violin Plot: Weight Distribution by Group')
axes[0, 1].set_xlabel('Treatment Group')
axes[0, 1].set_ylabel('Plant Weight')

# 3. Strip Plot with Means
# `order` pins the category positions so the red diamonds (plotted at
# 0..k-1 in `means` order) sit over the matching group.
sns.stripplot(data=df, x='group', y='weight', order=group_order,
              alpha=0.6, size=8, ax=axes[0, 2])
axes[0, 2].scatter(group_positions, means, color='red', s=300,
                   marker='D', label='Mean', zorder=5, edgecolors='black', linewidths=2)
axes[0, 2].set_title('Individual Data Points with Means')
axes[0, 2].set_xlabel('Treatment Group')
axes[0, 2].set_ylabel('Plant Weight')
axes[0, 2].legend()

# 4. Bar Plot with Error Bars (error bars show one standard deviation)
axes[1, 0].bar(group_positions, means, yerr=stds, capsize=10,
               alpha=0.7, color=['skyblue', 'lightgreen', 'coral'])
axes[1, 0].set_xticks(group_positions)
axes[1, 0].set_xticklabels(group_order)
axes[1, 0].set_title('Mean Weight with Standard Deviation')
axes[1, 0].set_xlabel('Treatment Group')
axes[1, 0].set_ylabel('Mean Plant Weight')
axes[1, 0].grid(axis='y', alpha=0.3)

# 5. Histogram (one semi-transparent histogram per group, overlaid)
for group, group_data in weight_by_group:
    axes[1, 1].hist(group_data, alpha=0.6, label=group, bins=5)
axes[1, 1].set_title('Weight Distribution by Group')
axes[1, 1].set_xlabel('Plant Weight')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].legend()

# 6. Summary Statistics Table (rendered as a matplotlib table on a hidden axis)
axes[1, 2].axis('tight')
axes[1, 2].axis('off')
table_data = [
    [group, f"{group_data.mean():.2f}", f"{group_data.std():.2f}", f"{len(group_data)}"]
    for group, group_data in weight_by_group
]

table = axes[1, 2].table(cellText=table_data,
                         colLabels=['Group', 'Mean', 'Std Dev', 'N'],
                         cellLoc='center',
                         loc='center')
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 2)
axes[1, 2].set_title('Summary Statistics Table')

plt.tight_layout()
plt.savefig('plant_growth_visualization.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nVisualization saved as 'plant_growth_visualization.png'")

## 4. Check ANOVA Assumptions

Before performing ANOVA, we need to verify:
1. **Normality**: Data in each group should be approximately normally distributed
2. **Homogeneity of Variance**: Variances should be equal across groups
3. **Independence**: Observations should be independent (assumed by design)

### 4.1 Normality Test (Shapiro-Wilk)

In [None]:
# Shapiro-Wilk normality test, run separately on each treatment group
print("="*60)
print("NORMALITY TEST (Shapiro-Wilk)")
print("="*60)
print("Null Hypothesis: Data is normally distributed")
print("If p > 0.05: Fail to reject H0 (data is normal)\n")

# One dict per group; later cells read the 'Normal?' field from this list
normality_results = []

for group in df['group'].unique():
    group_data = df.loc[df['group'] == group, 'weight']
    stat, p_value = shapiro(group_data)
    looks_normal = p_value > 0.05

    normality_results.append({
        'Group': group,
        'Statistic': round(stat, 4),
        'P-value': round(p_value, 4),
        'Normal?': 'Yes ✓' if looks_normal else 'No ✗'
    })

    interpretation = 'Data appears normal' if looks_normal else 'Data may not be normal'
    print("\n".join([
        f"Group: {group}",
        f"  Statistic: {stat:.4f}",
        f"  P-value: {p_value:.4f}",
        f"  Interpretation: {interpretation}\n",
    ]))

normality_df = pd.DataFrame(normality_results)
print("\nSummary Table:")
print(normality_df.to_string(index=False))

### 4.2 Homogeneity of Variance Test (Levene's Test)

In [None]:
# Levene's test for equal variances across the treatment groups
print("="*60)
print("HOMOGENEITY OF VARIANCE TEST (Levene's Test)")
print("="*60)
print("Null Hypothesis: All groups have equal variances")
print("If p > 0.05: Fail to reject H0 (variances are equal)\n")

# One weight series per group, in order of first appearance in the data
groups = [df.loc[df['group'] == label, 'weight'] for label in df['group'].unique()]
stat, p_value = levene(*groups)

print(f"Levene's Statistic: {stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Both the interpretation and the recommendation hinge on the same 0.05 cut-off
if p_value > 0.05:
    interpretation = 'Variances are equal across groups ✓'
    recommendation = "\n✓ Assumption satisfied: We can proceed with standard ANOVA"
else:
    interpretation = 'Variances are NOT equal across groups ✗'
    recommendation = "\n⚠ Assumption violated: Consider Welch's ANOVA or transformation"

print(f"\nInterpretation: {interpretation}")
print(recommendation)

### 4.3 Visual Assessment of Assumptions

In [None]:
# 2x2 diagnostic figure for the ANOVA assumptions (normality, equal variance)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. Q-Q Plot (Overall)
stats.probplot(df['weight'], dist="norm", plot=axes[0, 0])
axes[0, 0].set_title('Q-Q Plot: Overall Weight Distribution')
axes[0, 0].grid(True, alpha=0.3)

# 2. Q-Q Plots by Group
# probplot(plot=ax) draws two artists per call (points + fit line) in fixed
# colours, so a positional legend would attach group names to the wrong
# artists. Compute the quantiles instead and draw each group explicitly, with
# the label on the points and the fit line in the matching colour.
qq_ax = axes[0, 1]
for group in df['group'].unique():
    group_data = df[df['group'] == group]['weight']
    (theoretical_q, ordered_vals), (slope, intercept, _r) = stats.probplot(group_data, dist="norm")
    points, = qq_ax.plot(theoretical_q, ordered_vals, 'o', alpha=0.7, label=f'{group}')
    qq_ax.plot(theoretical_q, slope * theoretical_q + intercept,
               color=points.get_color(), linewidth=1.5)

qq_ax.set_title('Q-Q Plots by Group (overlaid)')
qq_ax.set_xlabel('Theoretical quantiles')
qq_ax.set_ylabel('Ordered Values')
qq_ax.legend(loc='lower right')
qq_ax.grid(True, alpha=0.3)

# 3. Residuals Plot (each observation minus its own group mean)
for group in df['group'].unique():
    group_data = df[df['group'] == group]['weight']
    residuals = group_data - group_data.mean()
    axes[1, 0].scatter([group] * len(residuals), residuals, alpha=0.6, s=80)

axes[1, 0].axhline(y=0, color='r', linestyle='--', linewidth=2)
axes[1, 0].set_title('Residuals by Group')
axes[1, 0].set_xlabel('Treatment Group')
axes[1, 0].set_ylabel('Residuals')
axes[1, 0].grid(True, alpha=0.3)

# 4. Variance Comparison
variances = df.groupby('group')['weight'].var()
axes[1, 1].bar(range(len(variances)), variances.values,
               color=['skyblue', 'lightgreen', 'coral'], alpha=0.7)
axes[1, 1].set_xticks(range(len(variances)))
axes[1, 1].set_xticklabels(variances.index)
axes[1, 1].set_title('Variance Comparison Across Groups')
axes[1, 1].set_xlabel('Treatment Group')
axes[1, 1].set_ylabel('Variance')
axes[1, 1].grid(axis='y', alpha=0.3)

# Add values on bars
for i, v in enumerate(variances.values):
    axes[1, 1].text(i, v + 0.05, f'{v:.3f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('plant_growth_assumptions.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nAssumption diagnostics saved as 'plant_growth_assumptions.png'")

## 5. Perform One-Way ANOVA

**Hypotheses:**
- **H₀ (Null)**: μ_control = μ_trt1 = μ_trt2 (all group means are equal)
- **H₁ (Alternative)**: At least one group mean is different

In [None]:
# One-way ANOVA on weight across the three treatment arms, plus eta-squared
print("="*60)
print("ONE-WAY ANOVA RESULTS")
print("="*60)

# Weight series for each named treatment arm
ctrl, trt1, trt2 = (
    df.loc[df['group'] == arm, 'weight'] for arm in ('ctrl', 'trt1', 'trt2')
)

# F-test for equality of the three group means
f_stat, p_value = f_oneway(ctrl, trt1, trt2)

print(f"\nF-statistic: {f_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Degrees of freedom: k groups, n observations in total
k = 3  # number of groups
n = len(df)  # total sample size
df_between, df_within = k - 1, n - k

print(f"\nDegrees of Freedom:")
print(f"  Between Groups: {df_between}")
print(f"  Within Groups: {df_within}")
print(f"  Total: {n - 1}")

# Eta-squared = SS_between / SS_total
grand_mean = df['weight'].mean()
weights_per_group = [df.loc[df['group'] == g, 'weight'] for g in df['group'].unique()]
ss_between = sum(
    len(weights) * (weights.mean() - grand_mean)**2 for weights in weights_per_group
)
ss_total = sum((df['weight'] - grand_mean)**2)
eta_squared = ss_between / ss_total

print(f"\nEffect Size (Eta-squared η²): {eta_squared:.4f}")

# Map eta-squared onto a verbal label: the first upper bound it falls below
# wins, and anything at or above the last bound is "large".
effect_interpretation = "large"
for upper_bound, effect_label in ((0.01, "negligible"), (0.06, "small"), (0.14, "medium")):
    if eta_squared < upper_bound:
        effect_interpretation = effect_label
        break

print(f"Effect Size Interpretation: {effect_interpretation.upper()}")
print(f"({eta_squared*100:.1f}% of variance in weight is explained by treatment)")

# Decision at the conventional 5% level
print("\n" + "="*60)
alpha = 0.05
if p_value < alpha:
    decision_lines = [
        f"✓ REJECT NULL HYPOTHESIS (p = {p_value:.4f} < {alpha})",
        "\nConclusion: There IS a statistically significant difference",
        "in plant weight across the treatment groups.",
    ]
else:
    decision_lines = [
        f"✗ FAIL TO REJECT NULL HYPOTHESIS (p = {p_value:.4f} >= {alpha})",
        "\nConclusion: There is NO statistically significant difference",
        "in plant weight across the treatment groups.",
    ]
print(*decision_lines, sep="\n")
print("="*60)

### Alternative: Using statsmodels for detailed ANOVA table

In [None]:
# Linear model with treatment group as a categorical predictor
model = ols(formula='weight ~ C(group)', data=df).fit()

# Type II ANOVA table computed from the fitted model
anova_table = anova_lm(model, typ=2)

banner = "=" * 60

print("\nDetailed ANOVA Table:")
print(banner)
print(anova_table)

# Goodness-of-fit figures reported by the fitted model
print("\n" + banner)
print("Model Summary:")
print(banner)
print(f"R-squared: {model.rsquared:.4f}")
print(f"Adjusted R-squared: {model.rsquared_adj:.4f}")

## 6. Post-Hoc Analysis: Tukey HSD Test

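A significant ANOVA only tells us that at least one group mean differs, not which ones. If the ANOVA above is significant, we follow up with Tukey HSD pairwise comparisons to identify which specific groups differ.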

In [None]:
# Tukey HSD pairwise comparisons between all treatment groups
print("="*60)
print("POST-HOC TEST: Tukey HSD (Honestly Significant Difference)")
print("="*60)
print("Purpose: Identify which specific groups differ from each other\n")

tukey_result = pairwise_tukeyhsd(endog=df['weight'], groups=df['group'], alpha=0.05)

print(tukey_result)

# The summary table's first row is the header; the remaining rows are the pairs
summary_rows = tukey_result.summary().data
tukey_df = pd.DataFrame(data=summary_rows[1:], columns=summary_rows[0])

print("\n" + "="*60)
print("Interpretation:")
print("="*60)

for idx, row in tukey_df.iterrows():
    group1, group2 = row['group1'], row['group2']
    reject = row['reject']
    meandiff = float(row['meandiff'])
    p_adj = float(row['p-adj'])

    # Only the leading mark and the final verdict depend on the test outcome
    mark, verdict = (
        ('✓', 'Significantly different') if reject
        else ('✗', 'No significant difference')
    )
    print(f"\n{mark} {group1} vs {group2}:")
    print(f"  Mean difference: {meandiff:.3f}")
    print(f"  Adjusted p-value: {p_adj:.4f}")
    print(f"  Conclusion: {verdict}")

### Visualize Post-Hoc Results

In [None]:
# Two-panel post-hoc figure: group means with 95% CIs, and Tukey's
# simultaneous confidence intervals.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# 1. Mean comparison with confidence intervals
weight_by_group = df.groupby('group')['weight']
means = weight_by_group.mean()
stds = weight_by_group.std()
n_per_group = weight_by_group.size()
# Two-sided 95% t critical value per group (df = n - 1). With small groups
# the normal approximation (1.96) makes the intervals too narrow.
t_crit = stats.t.ppf(0.975, (n_per_group - 1).to_numpy())
ci = t_crit * stds / np.sqrt(n_per_group)  # 95% CI half-width

x_pos = range(len(means))
axes[0].bar(x_pos, means, yerr=ci, capsize=10, alpha=0.7,
            color=['skyblue', 'lightgreen', 'coral'])
axes[0].set_xticks(x_pos)
axes[0].set_xticklabels(means.index)
axes[0].set_title('Mean Weight with 95% Confidence Intervals', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Treatment Group')
axes[0].set_ylabel('Mean Plant Weight')
axes[0].grid(axis='y', alpha=0.3)

# Add significance annotations
# NOTE(review): `p_value` and `tukey_df` come from earlier cells; `p_value` is
# expected to hold the one-way ANOVA p-value here — confirm cells ran in order.
y_max = means.max() + ci.max()
if p_value < 0.05:
    group_positions = {label: pos for pos, label in enumerate(means.index)}
    bar_level = 0  # counts only the significant pairs so bars stack without gaps
    for _, row in tukey_df.iterrows():
        if not row['reject']:
            continue
        bar_level += 1
        group1_idx = group_positions[row['group1']]
        group2_idx = group_positions[row['group2']]

        y_pos = y_max + 0.3 * bar_level
        axes[0].plot([group1_idx, group2_idx], [y_pos, y_pos], 'k-', linewidth=1.5)
        axes[0].text((group1_idx + group2_idx) / 2, y_pos + 0.1, '*',
                     ha='center', va='bottom', fontsize=16)

# 2. Tukey HSD confidence intervals
# plot_simultaneous draws one interval per group around its mean (not per
# pairwise difference), so the axes are labelled accordingly.
tukey_result.plot_simultaneous(xlabel='Mean Plant Weight',
                               ylabel='Treatment Group', ax=axes[1])
axes[1].set_title('Tukey HSD: Simultaneous Confidence Intervals', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig('plant_growth_posthoc.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nPost-hoc visualization saved as 'plant_growth_posthoc.png'")

## 7. Final Summary and Conclusions

In [None]:
# Text report pulling together the results of the earlier cells.
# Reads from earlier cells: normality_results, f_stat, p_value (ANOVA),
# eta_squared, effect_interpretation, tukey_df.
print("="*60)
print("FINAL SUMMARY: Plant Growth ANOVA Analysis")
print("="*60)

print("\n1. RESEARCH QUESTION:")
print("   Does the type of treatment significantly change the weight of plants?")

# Weight series per treatment group, in order of first appearance in the data
weights_by_label = {
    label: df.loc[df['group'] == label, 'weight'] for label in df['group'].unique()
}

print("\n2. SAMPLE SIZE:")
for label, weights in weights_by_label.items():
    print(f"   {label}: n = {len(weights)}, mean = {weights.mean():.3f}")

print("\n3. ASSUMPTION CHECKS:")
all_normal = all(r['Normal?'] == 'Yes ✓' for r in normality_results)
# Re-run Levene's test under its own name. When the notebook is run top to
# bottom, `p_value` was last assigned by the ANOVA cell, so using it here
# would report the ANOVA p-value as the variance-homogeneity result.
levene_stat, levene_p = levene(*weights_by_label.values())
print(f"   Normality: {'✓ PASSED' if all_normal else '✗ FAILED'}")
print(f"   Homogeneity of Variance: {'✓ PASSED' if levene_p > 0.05 else '✗ FAILED'}")

print("\n4. ANOVA RESULTS:")
print(f"   F-statistic: {f_stat:.4f}")
print(f"   P-value: {p_value:.4f}")
print(f"   Effect Size (η²): {eta_squared:.4f} ({effect_interpretation})")

anova_significant = p_value < 0.05

print("\n5. STATISTICAL DECISION:")
if anova_significant:
    print("   ✓ SIGNIFICANT RESULT (p < 0.05)")
    print("   Treatment type DOES significantly affect plant weight.")
else:
    print("   ✗ NON-SIGNIFICANT RESULT (p >= 0.05)")
    print("   Treatment type DOES NOT significantly affect plant weight.")

print("\n6. POST-HOC FINDINGS (if applicable):")
if anova_significant:
    significant_pairs = [row for _, row in tukey_df.iterrows() if row['reject']]
    for row in significant_pairs:
        print(f"   • {row['group1']} vs {row['group2']}: "
              f"Significant difference (p = {float(row['p-adj']):.4f})")
    if not significant_pairs:
        # A significant omnibus test does not guarantee any single pair differs
        print("   No pairwise comparison reached significance after Tukey adjustment")
else:
    print("   N/A - Overall ANOVA was not significant")

print("\n7. PRACTICAL INTERPRETATION:")
if anova_significant:
    # Computed here rather than reusing a variable left over from a plotting cell
    group_means = df.groupby('group')['weight'].mean()
    best_group = group_means.idxmax()
    best_mean = group_means.max()
    print(f"   • {best_group} produced the highest mean weight ({best_mean:.3f})")
    print(f"   • Treatment choice matters for plant growth outcomes")
    print(f"   • {eta_squared*100:.1f}% of weight variation is explained by treatment")
else:
    print("   • No evidence that treatment type affects plant weight")
    print("   • All treatments produce similar results on average")

print("\n" + "="*60)
print("Analysis Complete!")
print("="*60)

## 8. Additional Analysis: Effect Size Confidence Intervals

In [None]:
# Bootstrap confidence intervals for eta-squared
def bootstrap_eta_squared(data, n_bootstrap=1000, random_state=None):
    """Return bootstrap replicates of eta-squared (SS_between / SS_total).

    Rows of ``data`` are resampled with replacement ``n_bootstrap`` times and
    eta-squared of ``weight`` across ``group`` is computed for each resample.

    Args:
        data: DataFrame with a ``group`` column (group labels) and a
            ``weight`` column (numeric response).
        n_bootstrap: Number of resamples to draw.
        random_state: Optional seed (anything accepted by
            ``numpy.random.default_rng``) for reproducible resampling. When
            ``None``, resampling uses pandas' default random source, as before.

    Returns:
        A list of eta-squared values, one per usable resample. Resamples that
        lose an entire group, or whose total sum of squares is not positive,
        are skipped, so the list may be shorter than ``n_bootstrap``.
    """
    if data.empty:
        return []

    # One generator shared across iterations, so a single seed fixes the
    # whole sequence of resamples.
    rng = None if random_state is None else np.random.default_rng(random_state)
    # Number of groups is taken from the data instead of being hard-coded.
    expected_groups = data['group'].nunique()

    eta_squared_values = []

    for _ in range(n_bootstrap):
        # Resample with replacement
        sample = data.sample(n=len(data), replace=True, random_state=rng)

        # Skip resamples in which some group was not drawn at all
        if sample['group'].nunique() != expected_groups:
            continue

        grand_mean = sample['weight'].mean()
        ss_total = ((sample['weight'] - grand_mean) ** 2).sum()
        if not ss_total > 0:
            continue

        grouped = sample.groupby('group')['weight']
        ss_between = (grouped.size() * (grouped.mean() - grand_mean) ** 2).sum()
        eta_squared_values.append(float(ss_between / ss_total))

    return eta_squared_values

# Percentile bootstrap CI for eta-squared, with a histogram of the replicates.
# Seed NumPy's global generator so the reported interval is the same on every
# run (the resampling uses DataFrame.sample without an explicit random_state).
np.random.seed(42)

print("Calculating bootstrap confidence intervals for effect size...")
bootstrap_eta = bootstrap_eta_squared(df, n_bootstrap=1000)

# Fail with a clear message instead of an opaque percentile error
if len(bootstrap_eta) == 0:
    raise ValueError("Bootstrap produced no usable resamples; cannot compute a confidence interval")

# 2.5th and 97.5th percentiles bound the central 95% of the replicates
ci_lower, ci_upper = np.percentile(bootstrap_eta, [2.5, 97.5])

print(f"\nEffect Size (η²) = {eta_squared:.4f}")
print(f"95% CI: [{ci_lower:.4f}, {ci_upper:.4f}]")

# Visualize
plt.figure(figsize=(10, 5))
plt.hist(bootstrap_eta, bins=30, alpha=0.7, edgecolor='black')
plt.axvline(eta_squared, color='red', linestyle='--', linewidth=2, label=f'Observed η² = {eta_squared:.4f}')
# Only the lower bound carries a label so the legend shows a single CI entry
plt.axvline(ci_lower, color='green', linestyle=':', linewidth=2, label='95% CI')
plt.axvline(ci_upper, color='green', linestyle=':', linewidth=2)
plt.xlabel('Eta-squared (η²)')
plt.ylabel('Frequency')
plt.title('Bootstrap Distribution of Effect Size (η²)')
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.savefig('plant_growth_effect_size_ci.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nEffect size CI visualization saved as 'plant_growth_effect_size_ci.png'")

## Conclusion

This notebook has provided a comprehensive ANOVA analysis of the Plant Growth dataset, including:

1. ✓ Data exploration and descriptive statistics
2. ✓ Comprehensive visualizations
3. ✓ Assumption checking (normality and homogeneity)
4. ✓ One-way ANOVA test
5. ✓ Effect size calculation
6. ✓ Post-hoc pairwise comparisons (Tukey HSD)
7. ✓ Confidence intervals for effect size

**Key Findings**: The analysis reveals whether different plant treatments lead to statistically significant differences in plant weight, with specific identification of which treatment pairs differ significantly.