In [328]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

### Create Dataframe

In [329]:
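# NOTE: this cell was only run once and doesn't need to be re-run each time.
# (If it is ever uncommented, it also needs `import os`, which the import cell does not include.)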
# pollutants = ['CO/', 'NO2/', 'Ozone/', 'PM2.5/', 'PM10/', 'SO2/']

# # read in files and create dataframe of all data for each pollutant
# for pollutant in pollutants:
#     # empty list to store dfs
#     dfs = []
#     # iterate through files in the directory
#     for filename in os.listdir('data/' + pollutant):
#         df = pd.read_csv('data/' + pollutant + filename)
#         dfs.append(df)

#     # concatenate all dfs in the list
#     merged_df = pd.concat(dfs, ignore_index=True)
#     df = merged_df.drop(columns=["POC", "Source", "Local Site Name", "Percent Complete", "AQS Parameter Code", "Method Code", "CBSA Code", "CBSA Name", "State FIPS Code", "State"])

#     # save the merged DataFrame to a new CSV file
#     df.to_csv('data/All/all_' + pollutant[:-1] + '.csv', index=False)

In [26]:
# NOTE: feel free to edit this cell to make the data cleaner, but otherwise it doesn't need to be run
# read in dataframes for each pollutant and merge into 1
# df_CO = pd.read_csv('data/All/all_CO.csv')
# df_NO2 = pd.read_csv('data/All/all_NO2.csv')
# df_Ozone = pd.read_csv('data/All/all_Ozone.csv')
# df_PM2_5 = pd.read_csv('data/All/all_PM2.5.csv')
# df_PM10 = pd.read_csv('data/All/all_PM10.csv')
# df_SO2 = pd.read_csv('data/All/all_SO2.csv')

# cols = ['Date', 'Site ID', 'County FIPS Code', 'County', 'Site Latitude', 'Site Longitude']

# mergedCO_NO2 = df_CO.merge(df_NO2, on=cols, how="outer", suffixes=('CO', 'NO2'))
# mergedOzone_PM2 = df_Ozone.merge(df_PM2_5, on=cols, how="outer", suffixes=('Ozone', 'PM25'))
# mergedPM10_SO2 = df_PM10.merge(df_SO2, on=cols, how="outer", suffixes=('PM10', 'SO2'))
# mergedCO_NO2_Ozone_PM2 = mergedCO_NO2.merge(mergedOzone_PM2, on=cols, how="outer")
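# NOTE(review): the line below merges df_PM10 rather than mergedPM10_SO2, so the SO2
# columns never reach all_merged and the PM10 columns keep their unsuffixed names.
# If SO2 is meant to be included, merge mergedPM10_SO2 here and regenerate the CSV.
# all_merged = mergedCO_NO2_Ozone_PM2.merge(df_PM10, on=cols, how='outer')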
# all_merged = all_merged[all_merged["AQS Parameter DescriptionPM25"] != "Acceptable PM2.5 AQI & Speciation Mass"]
# all_merged.to_csv('data/All/all_merged.csv')

In [27]:
# Load the merged pollutant table, skipping every column whose header contains
# "Unnamed" (the name pandas gives to a column with a blank header).
df = pd.read_csv(
    'data/All/all_merged.csv',
    usecols=lambda header: not ("Unnamed" in header),
)

In [28]:
# Lift the column-count limit so wide frames render in full, then preview the first 5 rows
pd.options.display.max_columns = None
df.head(5)

Unnamed: 0,Date,Site ID,Daily Max 8-hour CO Concentration,UnitsCO,Daily AQI ValueCO,Daily Obs CountCO,AQS Parameter DescriptionCO,County FIPS Code,County,Site Latitude,Site Longitude,Daily Max 1-hour NO2 Concentration,UnitsNO2,Daily AQI ValueNO2,Daily Obs CountNO2,AQS Parameter DescriptionNO2,Daily Max 8-hour Ozone Concentration,UnitsOzone,Daily AQI ValueOzone,Daily Obs CountOzone,AQS Parameter DescriptionOzone,Daily Mean PM2.5 Concentration,UnitsPM25,Daily AQI ValuePM25,Daily Obs CountPM25,AQS Parameter DescriptionPM25,Method Description,Daily Mean PM10 Concentration,Units,Daily AQI Value,Daily Obs Count,AQS Parameter Description
0,01/01/2015,490030003,,,,,,3,Box Elder,41.492707,-112.018863,,,,,,,,,,,5.0,ug/m3 LC,28.0,1.0,PM2.5 - Local Conditions,Thermo Scientific TEOM 1400 FDMS or 1405 8500C...,,,,,
1,01/01/2015,490037001,,,,,,3,Box Elder,41.945874,-112.233973,,,,,,0.036,ppm,33.0,17.0,Ozone,,,,,,,,,,,
2,01/01/2015,490050004,,,,,,5,Cache,41.731111,-111.8375,35.0,ppb,33.0,24.0,Nitrogen dioxide (NO2),0.027,ppm,25.0,17.0,Ozone,16.2,ug/m3 LC,64.0,1.0,PM2.5 - Local Conditions,R & P Model 2025 PM-2.5 Sequential Air Sampler...,,,,,
3,01/01/2015,490050004,,,,,,5,Cache,41.731111,-111.8375,35.0,ppb,33.0,24.0,Nitrogen dioxide (NO2),0.027,ppm,25.0,17.0,Ozone,15.3,ug/m3 LC,63.0,1.0,PM2.5 - Local Conditions,Thermo Scientific TEOM 1400 FDMS or 1405 8500C...,,,,,
4,01/01/2015,490071003,,,,,,7,Carbon,39.59575,-110.770111,,,,,,0.045,ppm,42.0,17.0,Ozone,,,,,,,,,,,


Data Type Corrections

In [333]:
# Cast every object-dtype column to pandas' dedicated 'string' dtype
object_cols = df.select_dtypes(include=['object']).columns
df[object_cols] = df[object_cols].astype('string')

# Parse the Date column into datetime values
df['Date'] = pd.to_datetime(df['Date'])

Feature Scaling (we may not want to do this?)

In [334]:
# Work on a copy of the float-typed columns only; columns of any other dtype
# are not carried over into the scaled frame built below.
df_floats = df.select_dtypes(include=['float']).copy()

# Min-max scale the float columns; NaN values are allowed to persist through scaling
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_floats)
df = pd.DataFrame(
    scaled_values,
    columns=df_floats.columns,
    index=df.index,
)
# Normalise any pd.NA markers to np.nan
df = df.replace({pd.NA: np.nan})

Impute Missing Values (Warning: this cell takes roughly 20 minutes to run)

In [None]:
# Drop rows with 10 or more missing values.
# dropna(thresh=k) keeps a row only if it has at least k non-NaN entries, so
# k = (number of columns - MAX_MISSING_PER_ROW) keeps rows with at most 9 NaNs.
# This is the same threshold as the earlier `df.shape[1] - 10 + 1` expression.
MAX_MISSING_PER_ROW = 9
# Copy right away so the column assignments below operate on an independent frame
# (previously the copy was taken only after the first assignment).
df = df.dropna(thresh=df.shape[1] - MAX_MISSING_PER_ROW).copy()

# Using KNN to impute the remaining missing values in the DataFrame.
# The highest percentage of missing values in any single column is approximately 10%,
# making KNN a suitable imputation method.

# Make sure every column is numeric; values that cannot be parsed become NaN
# and are filled in by the imputer below.
df[df.columns] = df[df.columns].apply(pd.to_numeric, errors='coerce')

# By default KNNImputer drops columns that contain no observed values, which would
# make the assignment below fail with an unhelpful shape error. Fail early instead.
all_nan_columns = df.columns[df.isna().all()].tolist()
if all_nan_columns:
    raise ValueError(f"Cannot impute columns with no observed values: {all_nan_columns}")

# Initialize KNN imputer
imputer = KNNImputer(n_neighbors=3)

# Apply KNN imputation; the frame's index and column labels are left unchanged
df[df.columns] = imputer.fit_transform(df[df.columns])