In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Read the cleaned AQI table (path is relative to the notebook's working
# directory) and print its first five rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer='cleaned_aqi_data.csv')
print(df.head(n=5))

   state_code  county_code  year  parameter_code  first_max_value  \
0           6            1  2018           42101              3.3   
1           6            1  2018           42101              3.6   
2           6            1  2018           42101              2.1   
3           6            1  2018           42101              2.3   
4           6            1  2018           42101              2.6   

   arithmetic_mean  observation_count  primary_exceedance_count  \
0         0.414305               8354                       0.0   
1         0.515045               8315                       0.0   
2         0.454929               8287                       0.0   
3         0.528526               6289                       0.0   
4         0.566017               8004                       0.0   

   ninetieth_percentile County_ID Pollutant  Pct_Unhealthy  Max_Last_Year  \
0                   0.7    06_001        CO            0.0            NaN   
1                   0.9    0

In [2]:
# Collapse the row-level measurements into one feature row per County_ID.
# Named aggregation assigns each output column its final name directly, so the
# names cannot drift out of step with the aggregation order.
# NOTE(review): 'worst_everr_surge' looks like a typo for 'worst_ever_surge';
# it is kept as-is because other cells may reference the column by this name.
clustering_data = (
    df.groupby('County_ID')
    .agg(
        pollution_avg=('arithmetic_mean', 'mean'),
        bad_day_level=('ninetieth_percentile', 'mean'),
        very_bad_days_count=('primary_exceedance_count', 'mean'),
        worst_everr_surge=('first_max_value', 'max'),
    )
    # Aggregates that come out missing are replaced with 0.
    .fillna(0)
)

In [3]:
# Later cells append cluster labels and evaluation flags to `clustering_data`.
# Drop them (if present) before scaling so that re-running this cell after
# those cells does not feed the derived columns back in as features.
# errors='ignore' keeps this a no-op on a fresh kernel, where the columns do
# not exist yet.
derived_columns = ['Cluster', 'actually_dangerous', 'model_said_dangerous']
feature_frame = clustering_data.drop(columns=derived_columns, errors='ignore')

# Standardise each feature column (StandardScaler defaults: zero mean, unit
# variance). The fitted `scaler` is kept so the transform can be reused.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(feature_frame)

In [4]:
# Partition the scaled rows into three clusters and store the label of each
# row on `clustering_data`.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' (warning about it in the releases before the switch), so leaving it
# unset makes the clustering depend on the installed version. random_state
# fixes the centroid initialisation for reproducibility.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clustering_data['Cluster'] = kmeans.fit_predict(data_scaled)

In [None]:
# Silhouette coefficient of the assigned labels on the scaled feature matrix.
sil_score = silhouette_score(data_scaled, clustering_data['Cluster'])
print(f"Cluster Quality: {sil_score:.3f}")

# The cluster with the largest mean 'very_bad_days_count' is treated as the
# high-risk cluster.
cluster_means = (
    clustering_data
    .groupby('Cluster')['very_bad_days_count']
    .mean()
)
high_risk_cluster_id = cluster_means.idxmax()

# Reference flag: 'very_bad_days_count' strictly above 5.
# Model flag: membership in the high-risk cluster.
# NOTE(review): the reference flag is derived from a column that was also a
# clustering input, so the "accuracy" below measures agreement with a
# threshold rule rather than performance against independent labels.
clustering_data['actually_dangerous'] = clustering_data['very_bad_days_count'].gt(5)
clustering_data['model_said_dangerous'] = clustering_data['Cluster'].eq(high_risk_cluster_id)

# Share of rows on which the two flags agree.
correct_predictions = (
    clustering_data['actually_dangerous']
    .eq(clustering_data['model_said_dangerous'])
    .sum()
)
total_predictions = clustering_data.shape[0]
accuracy = correct_predictions / total_predictions

print(f"Model Accuracy: {accuracy:.2%}")
print(f"Model Inertia: {kmeans.inertia_:.2f}")

Cluster Quality: 0.551
Model Accuracy: 79.82%
Model Inertia: 363.72
           pollution_avg  bad_day_level  very_bad_days_count  \
County_ID                                                      
06_011         11.647488      24.391023             6.856061   
06_019         13.844945      28.122681            13.769774   
06_025         19.316396      33.612230             4.588583   
06_027         10.660072      20.736066             4.789130   
06_029         12.963629      24.828936            11.952522   
06_031         22.736829      46.609115            19.403704   
06_037         10.127530      16.849028            10.964549   
06_039         12.804720      26.981939            10.755725   
06_047         11.903138      23.885484            12.593750   
06_051         15.337512      28.553938             7.627685   
06_063         15.683462      30.468023             7.500000   
06_065         10.640693      17.816860             7.842982   
06_071         10.638155      17.090