In [1]:
import pandas as pd
import numpy as np



In [3]:
# Step 1: Read the per-state PM2.5 CSV exports into DataFrames.
# NOTE(review): "Albama" is kept exactly as the original path spells it;
# confirm it matches the file name on disk before "correcting" it.
alabama_csv = "data/Albama.csv"
california_csv = "data/California.csv"
df_alabama = pd.read_csv(alabama_csv)
df_california = pd.read_csv(california_csv)



In [4]:
# Step 2: Stack the Alabama and California rows into a single frame.
# ignore_index=True discards the per-file row labels and renumbers from 0.
state_frames = [df_alabama, df_california]
df_pm25 = pd.concat(state_frames, ignore_index=True)



In [5]:
# Step 3: Mean of the daily PM2.5 readings for each (State, County) pair.
pm25_column = 'Daily Mean PM2.5 Concentration'
county_groups = df_pm25.groupby(['State', 'County'])
county_means = county_groups[pm25_column].mean()
# Name the Series first so reset_index() emits the final column name directly,
# yielding the columns State, County, Average_PM2.5_2020.
pm25_by_county = county_means.rename('Average_PM2.5_2020').reset_index()



In [6]:
# Step 4: Simulate heating-related features (randomly generated, not measured data).
# Seeding the global NumPy RNG makes the two draws below repeatable on re-run.
np.random.seed(42)
n_counties = len(pm25_by_county)
# Income bracket: integers from 1 to 5 (the upper bound 6 is exclusive); 1=lowest, 5=highest.
income_brackets = np.random.randint(1, 6, size=n_counties)
# Wood usage: uniform draw between 3 and 20, rounded to two decimals when stored.
wood_usage = np.random.uniform(3, 20, size=n_counties)
pm25_by_county["Avg_Income_Bracket"] = income_brackets
pm25_by_county["Wood_Usage_%"] = np.round(wood_usage, 2)



In [7]:
# Step 5: Subsidy Priority Score = wood usage * (6 - income bracket) * average PM2.5,
# rounded to two decimals.
# Subtracting the bracket from 6 inverts it, so lower brackets weigh more.
inverted_income = 6 - pm25_by_county["Avg_Income_Bracket"]
wood_income_weight = pm25_by_county["Wood_Usage_%"] * inverted_income
raw_score = wood_income_weight * pm25_by_county["Average_PM2.5_2020"]
pm25_by_county["Subsidy_Priority_Score"] = raw_score.round(2)



In [8]:
# Step 6: Write the scored county table to CSV (row index omitted) and confirm.
output_csv = "merged_subsidy_priority_data.csv"
pm25_by_county.to_csv(output_csv, index=False)

print(f"✅ File saved as '{output_csv}'")


✅ File saved as 'merged_subsidy_priority_data.csv'
