## Hypothesis 3: Does Energy Consumption Differ by Region?
This notebook analyzes whether average total energy consumption per city differs significantly across U.S. states, which serve as the "regions" in this hypothesis.
We use a one-way ANOVA to test for statistically significant differences between states.

In [None]:
!pip install pyxlsb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway

# Load the 'City' sheet of the energy-profiles workbook (.xlsb, read through the pyxlsb engine).
file_path = "/content/2016cityandcountyenergyprofiles (1).xlsb"
city_data = pd.read_excel(file_path, engine='pyxlsb', sheet_name='City', skiprows=1)

# The row at position 2 of the parsed sheet holds the real column headers:
# promote it to the column index, then discard it together with the two rows above it.
city_data.columns = city_data.iloc[2]
city_data = city_data.drop([0, 1, 2]).reset_index(drop=True)

# Locate the consumption columns by position. list.index() returns the FIRST match,
# so when a header label repeats, only its first occurrence is used.
column_labels = list(city_data.columns)
mwh_index = column_labels.index('consumption (MWh)')
tcf_index = column_labels.index('consumption (TcF)')

# Natural-gas volume -> MWh, so gas can be added to electricity on a common basis.
# NOTE(review): assumes 'TcF' means thousand cubic feet (Mcf) -- confirm against the
# dataset's documentation. A factor of 293071 is MWh per *trillion Btu* and must not be
# applied to a gas volume; it would make natural gas dominate the total by orders of magnitude.
MMBTU_PER_MCF = 1.037       # approximate U.S. average heat content of delivered natural gas
MWH_PER_MMBTU = 0.293071    # 1 MMBtu = 293.071 kWh
MWH_PER_MCF = MMBTU_PER_MCF * MWH_PER_MMBTU

# Assemble the analysis frame in one step; non-numeric consumption cells become NaN.
city_df = pd.DataFrame({
    'state': city_data['state_abbr'],
    'city_name': city_data['city_name'],
    'electricity_MWh': pd.to_numeric(city_data.iloc[:, mwh_index], errors='coerce'),
    'natural_gas_TcF': pd.to_numeric(city_data.iloc[:, tcf_index], errors='coerce'),
})

# Total energy in MWh = electricity + converted natural gas.
city_df['total_energy_MWh'] = (
    city_df['electricity_MWh'] + city_df['natural_gas_TcF'] * MWH_PER_MCF
)

# Keep only rows with a state and a computable total. Reassigning (rather than
# mutating in place) keeps this cell safe to re-run.
city_df = city_df.dropna(subset=['state', 'total_energy_MWh'])


In [None]:
# Mean total energy per city for each state, largest first; preview the top entries.
state_avg_energy = (
    city_df
    .groupby('state')['total_energy_MWh']
    .mean()
    .sort_values(ascending=False)
)
print(state_avg_energy.head())

# One box per state showing the spread of city-level totals.
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=city_df, x='state', y='total_energy_MWh', ax=ax)
ax.tick_params(axis='x', labelrotation=90)  # state labels are read vertically
ax.set_title('City-Level Total Energy Consumption by State')
fig.tight_layout()
plt.show()


In [None]:
# Significance threshold used to interpret the ANOVA p-value.
ALPHA = 0.05

# One array of city-level totals per state; states with only a single city are excluded.
grouped_data = [
    group['total_energy_MWh'].values
    for _, group in city_df.groupby('state')
    if len(group) > 1
]

if len(grouped_data) < 2:
    # f_oneway requires at least two groups and raises otherwise; report the
    # situation instead of failing the cell.
    anova_stat = anova_pval = float('nan')
    print("⚠️ ANOVA not run: fewer than two states have more than one city with data.")
else:
    # Run one-way ANOVA across the per-state samples.
    anova_stat, anova_pval = f_oneway(*grouped_data)
    # General format for the p-value so very small values are not displayed as 0.0000.
    print(f"ANOVA F-statistic: {anova_stat:.4f}, p-value: {anova_pval:.4g}")
    if anova_pval < ALPHA:
        print("✅ There is a statistically significant difference in energy use between states.")
    else:
        print("⚠️ No statistically significant difference found between states.")
