In [None]:
import requests
import csv

# set the API endpoint and parameters
api_endpoint = 'https://www.airnowapi.org/aq/observation/latLong/current/'
api_params = {
    'format': 'application/json',
    'latitude': '37.335480',
    'longitude': '-121.893028',
    'distance': '25',
    'API_KEY': '5D892B12-26BC-45C9-9A78-CC50B20DCF26'
}

# make a GET request to the API
response = requests.get(api_endpoint, params=api_params)

# check if the request was successful
if response.status_code == 200:
    # extract the data from the response JSON
    data = response.json()

    # write the data to a CSV file
    from google.colab import drive
    drive.mount('/drive')
    with open('/drive/Shared drives/DATA245/airnow.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # write the header row
        writer.writerow(['DateObserved', 'HourObserved', 'LocalTimeZone', 'ReportingArea', 'StateCode', 'Latitude', 'Longitude', 'ParameterName', 'AQI'])
        # write each observation as a row in the CSV file
        for observation in data:
            writer.writerow([observation['DateObserved'], observation['HourObserved'], observation['LocalTimeZone'], observation['ReportingArea'], observation['StateCode'], observation['Latitude'], observation['Longitude'], observation['ParameterName'], observation['AQI']])
else:
    print('Error:', response.status_code)

Mounted at /drive


In [1]:
from google.colab import drive
drive.mount('/drive')
import pandas as pd

# Load data from the CSV files
ozone_data_2023 = pd.read_csv('/drive/Shared drives/DATA245/Ozone_2023.csv')
pm25_data_2023 = pd.read_csv('/drive/Shared drives/DATA245/PM2.5_2023.csv')
ozone_data_2022 = pd.read_csv('/drive/Shared drives/DATA245/Ozone_2022.csv')
pm25_data_2022 = pd.read_csv('/drive/Shared drives/DATA245/PM2.5_2022.csv')
ozone_data_2021 = pd.read_csv('/drive/Shared drives/DATA245/Ozone_2021.csv')
pm25_data_2021 = pd.read_csv('/drive/Shared drives/DATA245/PM2.5_2021.csv')
ozone_data_2020 = pd.read_csv('/drive/Shared drives/DATA245/Ozone_2020.csv')
pm25_data_2020 = pd.read_csv('/drive/Shared drives/DATA245/PM2.5_2020.csv')
airnow_data = pd.read_csv('/drive/Shared drives/DATA245/airnow.csv')

ozone_data = pd.concat([ozone_data_2020, ozone_data_2021, ozone_data_2022, ozone_data_2023], ignore_index=True)
pm25_data = pd.concat([pm25_data_2020, pm25_data_2021, pm25_data_2022, pm25_data_2023], ignore_index=True)

Mounted at /drive


In [None]:
#Exploratory Data Analysis
#Check data shape and types
print("Airnow data shape: ", airnow_data.shape)
print("Ozone data shape: ", ozone_data.shape)
print("PM2.5 data shape: ", pm25_data.shape)

print("\nAirnow data types: \n", airnow_data.dtypes)
print("\nOzone data types: \n", ozone_data.dtypes)
print("\nPM2.5 data types: \n", pm25_data.dtypes)

#Summary statistics
print("\nSummary statistics of Airnow data: \n", airnow_data.describe())
print("\nSummary statistics of Ozone data: \n", ozone_data.describe())
print("\nSummary statistics of PM2.5 data: \n", pm25_data.describe())

#Check for missing values
print("\nMissing values in Airnow data: \n", airnow_data.isnull().sum())
print("\nMissing values in Ozone data: \n", ozone_data.isnull().sum())
print("\nMissing values in PM2.5 data: \n", pm25_data.isnull().sum())

Airnow data shape:  (2, 9)
Ozone data shape:  (190583, 20)
PM2.5 data shape:  (183768, 20)

Airnow data types: 
 DateObserved      object
HourObserved       int64
LocalTimeZone     object
ReportingArea     object
StateCode         object
Latitude         float64
Longitude        float64
ParameterName     object
AQI                int64
dtype: object

Ozone data types: 
 Date                                     object
Source                                   object
Site ID                                   int64
POC                                       int64
Daily Max 8-hour Ozone Concentration    float64
UNITS                                    object
DAILY_AQI_VALUE                           int64
Site Name                                object
DAILY_OBS_COUNT                           int64
PERCENT_COMPLETE                        float64
AQS_PARAMETER_CODE                        int64
AQS_PARAMETER_DESC                       object
CBSA_CODE                               float64
CBS

In [None]:
trial = ozone_data

In [2]:
import pandas as pd

# Load data from the CSV files
#ozone_data = pd.read_csv('/drive/Shared drives/DATA245/Ozone_California_data.csv')
#pm25_data = pd.read_csv('/drive/Shared drives/DATA245/PM2.5_California_data.csv')
#airnow_data = pd.read_csv('/drive/Shared drives/DATA245/airnow.csv')

# Preprocess the data
# Drop unnecessary columns from ozone_data and pm25_data
ozone_data.drop(columns=['Source', 'POC', 'UNITS', 'Site Name', 'PERCENT_COMPLETE', 'AQS_PARAMETER_CODE', 'AQS_PARAMETER_DESC', 'CBSA_CODE', 'CBSA_NAME', 'STATE_CODE', 'STATE', 'COUNTY_CODE', 'COUNTY'], inplace=True)
pm25_data.drop(columns=['Source', 'POC', 'UNITS', 'Site Name', 'PERCENT_COMPLETE', 'AQS_PARAMETER_CODE', 'AQS_PARAMETER_DESC', 'CBSA_CODE', 'CBSA_NAME', 'STATE_CODE', 'STATE', 'COUNTY_CODE', 'COUNTY'], inplace=True)

# Rename columns for consistency
ozone_data.rename(columns={'Daily Max 8-hour Ozone Concentration': 'Ozone', 'DAILY_AQI_VALUE_x': 'Ozone AQI'}, inplace=True)
pm25_data.rename(columns={'Daily Mean PM2.5 Concentration': 'PM2.5', 'DAILY_AQI_VALUE_y': 'PM2.5 AQI'}, inplace=True)

# Combine ozone_data and pm25_data into one dataframe
combined_data = pd.merge(ozone_data, pm25_data, on=['Date', 'Site ID'])

In [None]:
# Before label encoding
coords_before = list(zip(combined_data['Site ID'], combined_data['SITE_LATITUDE_x'], combined_data['SITE_LONGITUDE_x'], combined_data['SITE_LATITUDE_y'], combined_data['SITE_LONGITUDE_y']))

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Instantiate a LabelEncoder object
label_encoder = LabelEncoder()

# Encode the categorical column
combined_data['SITE_LATITUDE_x'] = label_encoder.fit_transform(combined_data['SITE_LATITUDE_x'])
combined_data['SITE_LONGITUDE_x'] = label_encoder.fit_transform(combined_data['SITE_LONGITUDE_x'])
combined_data['SITE_LATITUDE_y'] = label_encoder.fit_transform(combined_data['SITE_LATITUDE_y'])
combined_data['SITE_LONGITUDE_y'] = label_encoder.fit_transform(combined_data['SITE_LONGITUDE_y'])

# Print the encoded dataframe
combined_data.head()


Unnamed: 0,Date,Site ID,Ozone,DAILY_AQI_VALUE_x,DAILY_OBS_COUNT_x,SITE_LATITUDE_x,SITE_LONGITUDE_x,PM2.5,DAILY_AQI_VALUE_y,DAILY_OBS_COUNT_y,SITE_LATITUDE_y,SITE_LONGITUDE_y
0,01/01/2020,60010007,0.025,23,17,117,21,8.6,36,1,117,21
1,01/02/2020,60010007,0.017,16,17,117,21,4.5,19,1,117,21
2,01/03/2020,60010007,0.013,12,17,117,21,14.2,55,1,117,21
3,01/04/2020,60010007,0.028,26,17,117,21,10.9,45,1,117,21
4,01/05/2020,60010007,0.031,29,17,117,21,7.8,33,1,117,21


In [None]:
# After label encoding
coords_after = list(zip(combined_data['Site ID'], combined_data['SITE_LATITUDE_x'], combined_data['SITE_LONGITUDE_x'], combined_data['SITE_LATITUDE_y'], combined_data['SITE_LONGITUDE_y']))

In [None]:
# Print the list of tuples
print("Coordinates before encoding:", coords_before)
print("Coordinates after encoding:", coords_after)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Create a DataFrame from the list of tuples

df = pd.DataFrame(coords_before, columns=['Site ID', 'Latitude (before)', 'Longitude (before)', 'Latitude (after)', 'Longitude (after)'])
df.drop(['Latitude (after)', 'Longitude (after)'], axis=1)

# Create a DataFrame from the list of tuples
df1 = pd.DataFrame(coords_after, columns=['Site ID', 'Latitude (before)', 'Longitude (before)', 'Latitude (after)', 'Longitude (after)'])
df1.drop(['Latitude (before)', 'Longitude (before)'], axis=1)

merged_df = pd.merge(df, df1, on='Site ID')

merged_trial_df = pd.merge(merged_df, trial[['Site ID', 'County']], on=['Site ID'], how='inner')

In [None]:
merged_trial_df.head()

In [None]:
# Export the DataFrame to a CSV file
merged_trial_df.to_csv('/drive/Shared drives/DATA245/coordinates_before_after_encoding.csv', index=False)

In [None]:
combined_data['SITE_LATITUDE_x'].describe()

count    128640.000000
mean         84.071634
std          45.448991
min           0.000000
25%          43.000000
50%          88.000000
75%         123.000000
max         162.000000
Name: SITE_LATITUDE_x, dtype: float64

In [None]:
combined_data.head()

Unnamed: 0,Date,Site ID,Ozone,DAILY_AQI_VALUE_x,DAILY_OBS_COUNT_x,SITE_LATITUDE_x,SITE_LONGITUDE_x,PM2.5,DAILY_AQI_VALUE_y,DAILY_OBS_COUNT_y,SITE_LATITUDE_y,SITE_LONGITUDE_y
0,01/01/2020,60010007,0.025,23,17,117,21,8.6,36,1,117,21
1,01/02/2020,60010007,0.017,16,17,117,21,4.5,19,1,117,21
2,01/03/2020,60010007,0.013,12,17,117,21,14.2,55,1,117,21
3,01/04/2020,60010007,0.028,26,17,117,21,10.9,45,1,117,21
4,01/05/2020,60010007,0.031,29,17,117,21,7.8,33,1,117,21


In [None]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 128640 entries, 0 to 128639
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Date               128640 non-null  object 
 1   Site ID            128640 non-null  int64  
 2   Ozone              128640 non-null  float64
 3   DAILY_AQI_VALUE_x  128640 non-null  int64  
 4   DAILY_OBS_COUNT_x  128640 non-null  int64  
 5   SITE_LATITUDE_x    128640 non-null  int64  
 6   SITE_LONGITUDE_x   128640 non-null  int64  
 7   PM2.5              128640 non-null  float64
 8   DAILY_AQI_VALUE_y  128640 non-null  int64  
 9   DAILY_OBS_COUNT_y  128640 non-null  int64  
 10  SITE_LATITUDE_y    128640 non-null  int64  
 11  SITE_LONGITUDE_y   128640 non-null  int64  
dtypes: float64(2), int64(9), object(1)
memory usage: 12.8+ MB


In [None]:
combined_data.columns

Index(['Date', 'Site ID', 'Ozone', 'DAILY_AQI_VALUE_x', 'DAILY_OBS_COUNT_x',
       'SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'PM2.5', 'DAILY_AQI_VALUE_y',
       'DAILY_OBS_COUNT_y', 'SITE_LATITUDE_y', 'SITE_LONGITUDE_y'],
      dtype='object')

In [None]:
combined_data.describe()

Unnamed: 0,Site ID,Ozone,DAILY_AQI_VALUE_x,DAILY_OBS_COUNT_x,SITE_LATITUDE_x,SITE_LONGITUDE_x,PM2.5,DAILY_AQI_VALUE_y,DAILY_OBS_COUNT_y,SITE_LATITUDE_y,SITE_LONGITUDE_y
count,7103.0,7103.0,7103.0,7103.0,7103.0,7103.0,7103.0,7103.0,7103.0,7103.0,7103.0
mean,60600470.0,0.037456,34.722652,23.603829,36.172476,-119.819657,5.926313,23.941996,1.0,36.172476,-119.819657
std,312770.4,0.007275,6.908009,1.493211,2.127606,1.895278,4.622077,16.501308,0.0,2.127606,1.895278
min,60010010.0,0.002,2.0,16.0,32.57816,-122.818294,-2.6,0.0,1.0,32.57816,-122.818294
25%,60370020.0,0.033,31.0,24.0,34.2431,-121.574684,2.9,12.0,1.0,34.2431,-121.574684
50%,60659000.0,0.038,35.0,24.0,36.48187,-119.8284,4.8,20.0,1.0,36.48187,-119.8284
75%,60831010.0,0.042,39.0,24.0,37.814781,-118.205,7.8,33.0,1.0,37.814781,-118.205
max,61131000.0,0.062,74.0,24.0,41.726892,-115.48307,37.0,105.0,1.0,41.726892,-115.48307


Linear Regression NEW

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Split the data into training and testing sets
train_data = combined_data[combined_data['Date'] < '02/28/2023']
test_data = combined_data[combined_data['Date'] >= '02/28/2023']

# Prepare the data for machine learning
X_train_ozone = train_data[['SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'Ozone']]
y_train_ozone = train_data['DAILY_AQI_VALUE_x']
X_test_ozone = test_data[['SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'Ozone']]
y_test_ozone = test_data['DAILY_AQI_VALUE_x']

X_train_pm25 = train_data[['SITE_LATITUDE_y', 'SITE_LONGITUDE_y', 'PM2.5']]
y_train_pm25 = train_data['DAILY_AQI_VALUE_y']
X_test_pm25 = test_data[['SITE_LATITUDE_y', 'SITE_LONGITUDE_y', 'PM2.5']]
y_test_pm25 = test_data['DAILY_AQI_VALUE_y']

# Train a linear regression model for Ozone AQI
lr_model_ozone = LinearRegression()
lr_model_ozone.fit(X_train_ozone, y_train_ozone)
lr_predictions_ozone = lr_model_ozone.predict(X_test_ozone)
lr_r2_ozone = r2_score(y_test_ozone, lr_predictions_ozone)

# Train a linear regression model for PM 2.5 AQI
lr_model_pm25 = LinearRegression()
lr_model_pm25.fit(X_train_pm25, y_train_pm25)
lr_predictions_pm25 = lr_model_pm25.predict(X_test_pm25)
lr_r2_pm25 = r2_score(y_test_pm25, lr_predictions_pm25)

print("R-squared score for Ozone AQI:", lr_r2_ozone)
print("R-squared score for PM 2.5 AQI:", lr_r2_pm25)

#Calculate MAE for Ozone AQI
lr_mae_ozone = mean_absolute_error(y_test_ozone, lr_predictions_ozone)

#Calculate MSE for Ozone AQI
lr_mse_ozone = mean_squared_error(y_test_ozone, lr_predictions_ozone)

#Calculate RMSE for Ozone AQI
lr_rmse_ozone = mean_squared_error(y_test_ozone, lr_predictions_ozone, squared=False)

#Calculate MAE for PM 2.5 AQI
lr_mae_pm25 = mean_absolute_error(y_test_pm25, lr_predictions_pm25)

#Calculate MSE for PM 2.5 AQI
lr_mse_pm25 = mean_squared_error(y_test_pm25, lr_predictions_pm25)

#Calculate RMSE for PM 2.5 AQI
lr_rmse_pm25 = mean_squared_error(y_test_pm25, lr_predictions_pm25, squared=False)

print("MAE score for Ozone AQI:", lr_mae_ozone)
print("MSE score for Ozone AQI:", lr_mse_ozone)
print("RMSE score for Ozone AQI:", lr_rmse_ozone)
print("MAE score for PM 2.5 AQI:", lr_mae_pm25)
print("MSE score for PM 2.5 AQI:", lr_mse_pm25)
print("RMSE score for PM 2.5 AQI:", lr_rmse_pm25)

R-squared score for Ozone AQI: 0.6775341616211465
R-squared score for PM 2.5 AQI: 0.5864454619462837
MAE score for Ozone AQI: 5.2101397624346975
MSE score for Ozone AQI: 211.75172373179834
RMSE score for Ozone AQI: 14.551691438860237
MAE score for PM 2.5 AQI: 4.561410540535399
MSE score for PM 2.5 AQI: 279.4981421554514
RMSE score for PM 2.5 AQI: 16.718197933851943


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Train a linear regression model for Ozone AQI with hyperparameter tuning
lr_model_ozone = LinearRegression()
parameters = {'fit_intercept': [True, False], 'copy_X': [True, False]}
lr_grid_ozone = GridSearchCV(lr_model_ozone, parameters, cv=10)
lr_grid_ozone.fit(X_train_ozone, y_train_ozone)

# Train a linear regression model for PM 2.5 AQI with hyperparameter tuning
lr_model_pm25 = LinearRegression()
parameters = {'fit_intercept': [True, False], 'copy_X': [True, False]}
lr_grid_pm25 = GridSearchCV(lr_model_pm25, parameters, cv=10)
lr_grid_pm25.fit(X_train_pm25, y_train_pm25)

# Print the best hyperparameters and the corresponding R2 score
print("Best hyperparameters for Ozone AQI:", lr_grid_ozone.best_params_)
print("R-squared score with best hyperparameters for Ozone AQI:", lr_grid_ozone.best_score_)
print("Best hyperparameters for PM 2.5 AQI:", lr_grid_pm25.best_params_)
print("R-squared score with best hyperparameters for PM 2.5 AQI:", lr_grid_pm25.best_score_)

#Predict using the best hyperparameters for Ozone AQI
lr_predictions_ozone = lr_grid_ozone.predict(X_test_ozone)

#Calculate MAE for Ozone AQI with best hyperparameters
lr_mae_ozone = mean_absolute_error(y_test_ozone, lr_predictions_ozone)

#Calculate MSE for Ozone AQI with best hyperparameters
lr_mse_ozone = mean_squared_error(y_test_ozone, lr_predictions_ozone)

#Calculate RMSE for Ozone AQI with best hyperparameters
lr_rmse_ozone = mean_squared_error(y_test_ozone, lr_predictions_ozone, squared=False)

#Predict using the best hyperparameters for PM 2.5 AQI
lr_predictions_pm25 = lr_grid_pm25.predict(X_test_pm25)

#Calculate MAE for PM 2.5 AQI with best hyperparameters
lr_mae_pm25 = mean_absolute_error(y_test_pm25, lr_predictions_pm25)

#Calculate MSE for PM 2.5 AQI with best hyperparameters
lr_mse_pm25 = mean_squared_error(y_test_pm25, lr_predictions_pm25)

#Calculate RMSE for PM 2.5 AQI with best hyperparameters
lr_rmse_pm25 = mean_squared_error(y_test_pm25, lr_predictions_pm25, squared=False)

print("MAE score for Ozone AQI with best hyperparameters:", lr_mae_ozone)
print("MSE score for Ozone AQI with best hyperparameters:", lr_mse_ozone)
print("RMSE score for Ozone AQI with best hyperparameters:", lr_rmse_ozone)
print("MAE score for PM 2.5 AQI with best hyperparameters:", lr_mae_pm25)
print("MSE score for PM 2.5 AQI with best hyperparameters:", lr_mse_pm25)
print("RMSE score for PM 2.5 AQI with best hyperparameters:", lr_rmse_pm25)

Best hyperparameters for Ozone AQI: {'copy_X': True, 'fit_intercept': True}
R-squared score with best hyperparameters for Ozone AQI: 0.9911871824850476
Best hyperparameters for PM 2.5 AQI: {'copy_X': True, 'fit_intercept': True}
R-squared score with best hyperparameters for PM 2.5 AQI: 0.9591444076621853
MAE score for Ozone AQI with best hyperparameters: 5.2101397624346975
MSE score for Ozone AQI with best hyperparameters: 211.75172373179834
RMSE score for Ozone AQI with best hyperparameters: 14.551691438860237
MAE score for PM 2.5 AQI with best hyperparameters: 4.561410540535399
MSE score for PM 2.5 AQI with best hyperparameters: 279.4981421554514
RMSE score for PM 2.5 AQI with best hyperparameters: 16.718197933851943


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

# Prepare the data for machine learning
X = combined_data[['SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'Ozone']]
y = combined_data['DAILY_AQI_VALUE_x']

# Train a linear regression model with time-series cross-validation for Ozone AQI
lr_model_ozone = LinearRegression()
tscv = TimeSeriesSplit(n_splits=10)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    lr_model_ozone.fit(X_train, y_train)
    lr_predictions_ozone = lr_model_ozone.predict(X_test)
    lr_r2_ozone = r2_score(y_test, lr_predictions_ozone)

# Prepare the data for machine learning
X = combined_data[['SITE_LATITUDE_y', 'SITE_LONGITUDE_y', 'PM2.5']]
y = combined_data['DAILY_AQI_VALUE_y']

# Train a linear regression model with time-series cross-validation for PM 2.5 AQI
lr_model_pm25 = LinearRegression()
tscv = TimeSeriesSplit(n_splits=10)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    lr_model_pm25.fit(X_train, y_train)
    lr_predictions_pm25 = lr_model_pm25.predict(X_test)
    lr_r2_pm25 = r2_score(y_test, lr_predictions_pm25)

print("R-squared score for Ozone AQI:", lr_r2_ozone)
print("R-squared score for PM 2.5 AQI:", lr_r2_pm25)

#Calculate MAE for Ozone AQI
lr_mae_ozone = mean_absolute_error(y_test, lr_predictions_ozone)

#Calculate MSE for Ozone AQI
lr_mse_ozone = mean_squared_error(y_test, lr_predictions_ozone)

#Calculate RMSE for Ozone AQI
lr_rmse_ozone = mean_squared_error(y_test, lr_predictions_ozone, squared=False)

#Calculate MAE for PM 2.5 AQI
lr_mae_pm25 = mean_absolute_error(y_test, lr_predictions_pm25)

#Calculate MSE for PM 2.5 AQI
lr_mse_pm25 = mean_squared_error(y_test, lr_predictions_pm25)

#Calculate RMSE for PM 2.5 AQI
lr_rmse_pm25 = mean_squared_error(y_test, lr_predictions_pm25, squared=False)

print("MAE score for Ozone AQI:", lr_mae_ozone)
print("MSE score for Ozone AQI:", lr_mse_ozone)
print("RMSE score for Ozone AQI:", lr_rmse_ozone)
print("MAE score for PM 2.5 AQI:", lr_mae_pm25)
print("MSE score for PM 2.5 AQI:", lr_mse_pm25)
print("RMSE score for PM 2.5 AQI:", lr_rmse_pm25)

R-squared score for Ozone AQI: 0.7844706639146697
R-squared score for PM 2.5 AQI: 0.7330639080958296
MAE score for Ozone AQI: 22.602210394525354
MSE score for Ozone AQI: 690.0524268828281
RMSE score for Ozone AQI: 26.26884898283189
MAE score for PM 2.5 AQI: 7.246511339896148
MSE score for PM 2.5 AQI: 70.20005461327098
RMSE score for PM 2.5 AQI: 8.37854728537537


Random Forest NEW

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Split the data into training and testing sets
train_data = combined_data[combined_data['Date'] < '02/28/2023']
test_data = combined_data[combined_data['Date'] >= '02/28/2023']

# Prepare the data for machine learning
X_train_ozone = train_data[['SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'Ozone']]
y_train_ozone = train_data['DAILY_AQI_VALUE_x']
X_test_ozone = test_data[['SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'Ozone']]
y_test_ozone = test_data['DAILY_AQI_VALUE_x']

X_train_pm25 = train_data[['SITE_LATITUDE_y', 'SITE_LONGITUDE_y', 'PM2.5']]
y_train_pm25 = train_data['DAILY_AQI_VALUE_y']
X_test_pm25 = test_data[['SITE_LATITUDE_y', 'SITE_LONGITUDE_y', 'PM2.5']]
y_test_pm25 = test_data['DAILY_AQI_VALUE_y']

# Train a Random Forest Regression model for Ozone AQI
rf_model_ozone = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_ozone.fit(X_train_ozone, y_train_ozone)
rf_predictions_ozone = rf_model_ozone.predict(X_test_ozone)
rf_r2_ozone = r2_score(y_test_ozone, rf_predictions_ozone)

# Train a Random Forest Regression model for PM 2.5 AQI
rf_model_pm25 = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model_pm25.fit(X_train_pm25, y_train_pm25)
rf_predictions_pm25 = rf_model_pm25.predict(X_test_pm25)
rf_r2_pm25 = r2_score(y_test_pm25, rf_predictions_pm25)

print("R-squared score for Ozone AQI:", rf_r2_ozone)
print("R-squared score for PM 2.5 AQI:", rf_r2_pm25)

#Calculate MAE for Ozone AQI
rf_mae_ozone = mean_absolute_error(y_test_ozone, rf_predictions_ozone)

#Calculate MSE for Ozone AQI
rf_mse_ozone = mean_squared_error(y_test_ozone, rf_predictions_ozone)

#Calculate RMSE for Ozone AQI
rf_rmse_ozone = mean_squared_error(y_test_ozone, rf_predictions_ozone, squared=False)

#Calculate MAE for PM 2.5 AQI
rf_mae_pm25 = mean_absolute_error(y_test_pm25, rf_predictions_pm25)

#Calculate MSE for PM 2.5 AQI
rf_mse_pm25 = mean_squared_error(y_test_pm25, rf_predictions_pm25)

#Calculate RMSE for PM 2.5 AQI
rf_rmse_pm25 = mean_squared_error(y_test_pm25, rf_predictions_pm25, squared=False)

print("MAE score for Ozone AQI:", rf_mae_ozone)
print("MSE score for Ozone AQI:", rf_mse_ozone)
print("RMSE score for Ozone AQI:", rf_rmse_ozone)
print("MAE score for PM 2.5 AQI:", rf_mae_pm25)
print("MSE score for PM 2.5 AQI:", rf_mse_pm25)
print("RMSE score for PM 2.5 AQI:", rf_rmse_pm25)

R-squared score for Ozone AQI: 0.8403706093481483
R-squared score for PM 2.5 AQI: 0.9800068380094076
MAE score for Ozone AQI: 2.1268237651284734
MSE score for Ozone AQI: 104.82288232055667
RMSE score for Ozone AQI: 10.2383046604678
MAE score for PM 2.5 AQI: 0.22455293724785882
MSE score for PM 2.5 AQI: 13.512248368697035
RMSE score for PM 2.5 AQI: 3.6759010281422206


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

# Define the time-series cross-validator with a cut-off of 1 day
tscv = TimeSeriesSplit(n_splits=5)

# Split the data into training and testing sets using time-series cross-validation
for train_index, test_index in tscv.split(combined_data):
    train_data = combined_data.iloc[train_index]
    test_data = combined_data.iloc[test_index]
    
    # Prepare the data for machine learning
    X_train_ozone = train_data[['SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'Ozone']]
    y_train_ozone = train_data['DAILY_AQI_VALUE_x']
    X_test_ozone = test_data[['SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'Ozone']]
    y_test_ozone = test_data['DAILY_AQI_VALUE_x']

    X_train_pm25 = train_data[['SITE_LATITUDE_y', 'SITE_LONGITUDE_y', 'PM2.5']]
    y_train_pm25 = train_data['DAILY_AQI_VALUE_y']
    X_test_pm25 = test_data[['SITE_LATITUDE_y', 'SITE_LONGITUDE_y', 'PM2.5']]
    y_test_pm25 = test_data['DAILY_AQI_VALUE_y']

    # Train a Random Forest Regression model for Ozone AQI
    rf_model_ozone = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model_ozone.fit(X_train_ozone, y_train_ozone)
    rf_predictions_ozone = rf_model_ozone.predict(X_test_ozone)
    rf_r2_ozone = r2_score(y_test_ozone, rf_predictions_ozone)

    # Train a Random Forest Regression model for PM 2.5 AQI
    rf_model_pm25 = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model_pm25.fit(X_train_pm25, y_train_pm25)
    rf_predictions_pm25 = rf_model_pm25.predict(X_test_pm25)
    rf_r2_pm25 = r2_score(y_test_pm25, rf_predictions_pm25)

    print("R-squared score for Ozone AQI:", rf_r2_ozone)
    print("R-squared score for PM 2.5 AQI:", rf_r2_pm25)

    #Calculate MAE for Ozone AQI
    rf_mae_ozone = mean_absolute_error(y_test_ozone, rf_predictions_ozone)

    #Calculate MSE for Ozone AQI
    rf_mse_ozone = mean_squared_error(y_test_ozone, rf_predictions_ozone)

    #Calculate RMSE for Ozone AQI
    rf_rmse_ozone = mean_squared_error(y_test_ozone, rf_predictions_ozone, squared=False)

    #Calculate MAE for PM 2.5 AQI
    rf_mae_pm25 = mean_absolute_error(y_test_pm25, rf_predictions_pm25)

    #Calculate MSE for PM 2.5 AQI
    rf_mse_pm25 = mean_squared_error(y_test_pm25, rf_predictions_pm25)

    #Calculate RMSE for PM 2.5 AQI
    rf_rmse_pm25 = mean_squared_error(y_test_pm25, rf_predictions_pm25, squared=False)

    print("MAE score for Ozone AQI:", rf_mae_ozone)
    print("MSE score for Ozone AQI:", rf_mse_ozone)
    print("RMSE score for Ozone AQI:", rf_rmse_ozone)
    print("MAE score for PM2.5 AQI:", rf_mae_pm25)
    print("MSE score for PM2.5 AQI:", rf_mse_pm25)
    print("RMSE score for PM2.5 AQI:", rf_rmse_pm25)

R-squared score for Ozone AQI: 0.9999889902364401
R-squared score for PM 2.5 AQI: 0.9999769039031571
MAE score for Ozone AQI: 0.0037159514925373107
MSE score for Ozone AQI: 0.007707196828358206
RMSE score for Ozone AQI: 0.08779064203181457
MAE score for PM2.5 AQI: 0.00452145522388059
MSE score for PM2.5 AQI: 0.019143311567164166
RMSE score for PM2.5 AQI: 0.13835935663034923
R-squared score for Ozone AQI: 0.9999999149837422
R-squared score for PM 2.5 AQI: 0.9998704731992097
MAE score for Ozone AQI: 0.00010541044776119475
MSE score for Ozone AQI: 5.611940298507574e-05
RMSE score for Ozone AQI: 0.007491288472958156
MAE score for PM2.5 AQI: 0.004028451492537303
MSE score for PM2.5 AQI: 0.08181710354477618
RMSE score for PM2.5 AQI: 0.2860368919296533
R-squared score for Ozone AQI: 0.9999999142982329
R-squared score for PM 2.5 AQI: 0.9999993164288208
MAE score for Ozone AQI: 7.649253731343304e-05
MSE score for Ozone AQI: 4.3106343283580846e-05
RMSE score for Ozone AQI: 0.006565542116503469
M

XGBoost NEW

In [None]:
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Split the data into training and testing sets
train_data = combined_data[combined_data['Date'] < '02/28/2023']
test_data = combined_data[combined_data['Date'] >= '02/28/2023']

# Prepare the data for machine learning
X_train_ozone = train_data[['SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'Ozone']]
y_train_ozone = train_data['DAILY_AQI_VALUE_x']
X_test_ozone = test_data[['SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'Ozone']]
y_test_ozone = test_data['DAILY_AQI_VALUE_x']

X_train_pm25 = train_data[['SITE_LATITUDE_y', 'SITE_LONGITUDE_y', 'PM2.5']]
y_train_pm25 = train_data['DAILY_AQI_VALUE_y']
X_test_pm25 = test_data[['SITE_LATITUDE_y', 'SITE_LONGITUDE_y', 'PM2.5']]
y_test_pm25 = test_data['DAILY_AQI_VALUE_y']

# Train an XGBoost model for Ozone AQI
xgb_model_ozone = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model_ozone.fit(X_train_ozone, y_train_ozone)
xgb_predictions_ozone = xgb_model_ozone.predict(X_test_ozone)
xgb_r2_ozone = r2_score(y_test_ozone, xgb_predictions_ozone)

# Train an XGBoost model for PM 2.5 AQI
xgb_model_pm25 = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model_pm25.fit(X_train_pm25, y_train_pm25)
xgb_predictions_pm25 = xgb_model_pm25.predict(X_test_pm25)
xgb_r2_pm25 = r2_score(y_test_pm25, xgb_predictions_pm25)

print("R-squared score for Ozone AQI:", xgb_r2_ozone)
print("R-squared score for PM 2.5 AQI:", xgb_r2_pm25)

#Calculate MAE for Ozone AQI
xgb_mae_ozone = mean_absolute_error(y_test_ozone, xgb_predictions_ozone)

#Calculate MSE for Ozone AQI
xgb_mse_ozone = mean_squared_error(y_test_ozone, xgb_predictions_ozone)

#Calculate RMSE for Ozone AQI
xgb_rmse_ozone = mean_squared_error(y_test_ozone, xgb_predictions_ozone, squared=False)

#Calculate MAE for PM 2.5 AQI
xgb_mae_pm25 = mean_absolute_error(y_test_pm25, xgb_predictions_pm25)

#Calculate MSE for PM 2.5 AQI
xgb_mse_pm25 = mean_squared_error(y_test_pm25, xgb_predictions_pm25)

#Calculate RMSE for PM 2.5 AQI
xgb_rmse_pm25 = mean_squared_error(y_test_pm25, xgb_predictions_pm25, squared=False)

print("MAE score for Ozone AQI:", xgb_mae_ozone)
print("MSE score for Ozone AQI:", xgb_mse_ozone)
print("RMSE score for Ozone AQI:", xgb_rmse_ozone)
print("MAE score for PM 2.5 AQI:", xgb_mae_pm25)
print("MSE score for PM 2.5 AQI:", xgb_mse_pm25)
print("RMSE score for PM 2.5 AQI:", xgb_rmse_pm25)

R-squared score for Ozone AQI: 0.8559994497983197
R-squared score for PM 2.5 AQI: 0.9803899389935226
MAE score for Ozone AQI: 1.9127894287859495
MSE score for Ozone AQI: 94.55998463846205
RMSE score for Ozone AQI: 9.724195835052997
MAE score for PM 2.5 AQI: 0.22067695808557833
MSE score for PM 2.5 AQI: 13.253332062707473
RMSE score for PM 2.5 AQI: 3.6405126098816734


XGBoost Hyperparameter Tuned

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit

# Prepare the data for machine learning
X = combined_data[['SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'Ozone']]
y = combined_data['DAILY_AQI_VALUE_x']

# Perform time-series cross-validation
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    # Split the data into training and testing sets
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train an XGBoost model
    xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
    xgb_model.fit(X_train, y_train)

    # Make predictions
    xgb_predictions = xgb_model.predict(X_test)

    # Calculate metrics
    xgb_r2 = r2_score(y_test, xgb_predictions)
    xgb_mae = mean_absolute_error(y_test, xgb_predictions)
    xgb_mse = mean_squared_error(y_test, xgb_predictions)
    xgb_rmse = mean_squared_error(y_test, xgb_predictions, squared=False)

    # Print the results
    print("R-squared score:", xgb_r2)
    print("MAE score:", xgb_mae)
    print("MSE score:", xgb_mse)
    print("RMSE score:", xgb_rmse)

    # Use the trained model to make a prediction for tomorrow's AQI value
    X_pred = np.array([[117, 21, 0.025]])
    y_pred = xgb_model.predict(X_pred)
    print("Tomorrow's Ozone AQI prediction:", y_pred)

R-squared score: 0.9999955498087763
MAE score: 0.003326899191336845
MSE score: 0.003115280314472282
RMSE score: 0.05581469622305833
Tomorrow's Ozone AQI prediction: [22.99961]
R-squared score: 0.9999999325719807
MAE score: 0.0014357363554968763
MSE score: 4.450937132377028e-05
RMSE score: 0.006671534405500003
Tomorrow's Ozone AQI prediction: [22.999443]
R-squared score: 0.9999998857645529
MAE score: 0.0012653907889555861
MSE score: 5.745823647324899e-05
RMSE score: 0.007580121138428395
Tomorrow's Ozone AQI prediction: [22.999435]
R-squared score: 0.9999999857063057
MAE score: 0.0013195460971528246
MSE score: 8.717087885386576e-06
RMSE score: 0.002952471487650063
Tomorrow's Ozone AQI prediction: [22.99937]
R-squared score: 0.9999999932941158
MAE score: 0.0010460879931698983
MSE score: 1.686136735813299e-06
RMSE score: 0.0012985132790284816
Tomorrow's Ozone AQI prediction: [22.999456]


In [None]:
import numpy as np
from sklearn.model_selection import TimeSeriesSplit

# Prepare the data for machine learning
X = combined_data[['SITE_LATITUDE_y', 'SITE_LONGITUDE_y', 'PM2.5']]
y = combined_data['DAILY_AQI_VALUE_y']

# Perform time-series cross-validation
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    # Split the data into training and testing sets
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train an XGBoost model
    xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
    xgb_model.fit(X_train, y_train)

    # Make predictions
    xgb_predictions = xgb_model.predict(X_test)

    # Calculate metrics
    xgb_r2 = r2_score(y_test, xgb_predictions)
    xgb_mae = mean_absolute_error(y_test, xgb_predictions)
    xgb_mse = mean_squared_error(y_test, xgb_predictions)
    xgb_rmse = mean_squared_error(y_test, xgb_predictions, squared=False)

    # Print the results
    print("R-squared score:", xgb_r2)
    print("MAE score:", xgb_mae)
    print("MSE score:", xgb_mse)
    print("RMSE score:", xgb_rmse)

    # Use the trained model to make a prediction for tomorrow's AQI value
    X_pred = np.array([[117, 21, 10.2]])
    y_pred = xgb_model.predict(X_pred)
    print("Tomorrow's PM 2.5 AQI prediction:", y_pred)

R-squared score: 0.9999638480268741
MAE score: 0.011823839237982992
MSE score: 0.029964737765990276
RMSE score: 0.17310325752564645
Tomorrow's PM 2.5 AQI prediction: [42.99874]
R-squared score: 0.9999280434611126
MAE score: 0.006680053636108511
MSE score: 0.045452180992288906
RMSE score: 0.21319517112798053
Tomorrow's PM 2.5 AQI prediction: [42.998558]
R-squared score: 0.9999978292185137
MAE score: 0.003328916366488339
MSE score: 0.00109510979014233
RMSE score: 0.03309244309721375
Tomorrow's PM 2.5 AQI prediction: [42.998566]
R-squared score: 0.9999832640646245
MAE score: 0.0023694140169597507
MSE score: 0.0065537576252396494
RMSE score: 0.08095528163893724
Tomorrow's PM 2.5 AQI prediction: [42.99887]
R-squared score: 0.9999980581304622
MAE score: 0.0013228634415357722
MSE score: 0.0006785241308449888
RMSE score: 0.026048495750138602
Tomorrow's PM 2.5 AQI prediction: [42.998707]


LSTM NEW

In [None]:
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.metrics import r2_score

# Normalize the data
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_ozone = scaler.fit_transform(X_train_ozone)
X_test_ozone = scaler.transform(X_test_ozone)

# # Reshape the data for LSTM input
X_train_ozone = X_train_ozone.reshape((X_train_ozone.shape[0], 1, X_train_ozone.shape[1]))
X_test_ozone = X_test_ozone.reshape((X_test_ozone.shape[0], 1, X_test_ozone.shape[1]))

# Define the LSTM model
def create_model(input_shape):
    model = Sequential()
    model.add(LSTM(units=64, input_shape=input_shape))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Train an LSTM model for Ozone AQI
model_ozone = create_model(X_train_ozone.shape[1:])
history_ozone = model_ozone.fit(X_train_ozone, y_train_ozone, epochs=50, batch_size=32, validation_data=(X_test_ozone, y_test_ozone), verbose=0)

# Predict using the trained model
lstm_predictions_ozone = model_ozone.predict(X_test_ozone)

# Calculate evaluation metrics for Ozone AQI (LSTM)
lstm_r2_ozone = r2_score(y_test_ozone, lstm_predictions_ozone)
lstm_mae_ozone = mean_absolute_error(y_test_ozone, lstm_predictions_ozone)
lstm_mse_ozone = mean_squared_error(y_test_ozone, lstm_predictions_ozone)
lstm_rmse_ozone = mean_squared_error(y_test_ozone, lstm_predictions_ozone, squared=False)

print("R-squared score for Ozone AQI (LSTM):", lstm_r2_ozone)
print("MAE score for Ozone AQI (LSTM):", lstm_mae_ozone)
print("MSE score for Ozone AQI (LSTM):", lstm_mse_ozone)
print("RMSE score for Ozone AQI (LSTM):", lstm_rmse_ozone)


R-squared score for Ozone AQI (LSTM): 0.6987491500933589
MAE score for Ozone AQI (LSTM): 4.927529907322532
MSE score for Ozone AQI (LSTM): 197.82060346018886
RMSE score for Ozone AQI (LSTM): 14.064871256438463


In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.callbacks import EarlyStopping
import numpy as np

# Define a function to create an LSTM model
def create_model_lstm(input_shape):
    model = Sequential()
    model.add(LSTM(units=64, input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model
    

# Prepare the data for LSTM
X_train_pm25 = X_train_pm25.values.reshape((X_train_pm25.shape[0], 1, X_train_pm25.shape[1]))
X_test_pm25 = X_test_pm25.values.reshape((X_test_pm25.shape[0], 1, X_test_pm25.shape[1]))

# Train an LSTM model for PM 2.5 AQI
model_pm25 = create_model_lstm(X_train_pm25.shape[1:])
early_stopping_pm25 = EarlyStopping(monitor='val_loss', patience=5)
history_pm25 = model_pm25.fit(X_train_pm25, y_train_pm25, epochs=50, batch_size=32,
                              validation_data=(X_test_pm25, y_test_pm25),
                              callbacks=[early_stopping_pm25], verbose=0)

# Make predictions using the trained LSTM model for PM 2.5 AQI
lstm_predictions_pm25 = model_pm25.predict(X_test_pm25)
lstm_r2_pm25 = r2_score(y_test_pm25, lstm_predictions_pm25)

# Calculate evaluation metrics for PM 2.5 AQI (LSTM)
lstm_mae_pm25 = mean_absolute_error(y_test_pm25, lstm_predictions_pm25)
lstm_mse_pm25 = mean_squared_error(y_test_pm25, lstm_predictions_pm25)
lstm_rmse_pm25 = mean_squared_error(y_test_pm25, lstm_predictions_pm25, squared=False)

print("R-squared score for PM 2.5 AQI (LSTM):", lstm_r2_pm25)
print("MAE score for PM 2.5 AQI (LSTM):", lstm_mae_pm25)
print("MSE score for PM 2.5 AQI (LSTM):", lstm_mse_pm25)
print("RMSE score for PM 2.5 AQI (LSTM):", lstm_rmse_pm25)

R-squared score for PM 2.5 AQI (LSTM): 0.9535976347636015
MAE score for PM 2.5 AQI (LSTM): 1.026233345989656
MSE score for PM 2.5 AQI (LSTM): 31.360736449003774
RMSE score for PM 2.5 AQI (LSTM): 5.600065753989302


NEURAL NETWORK NEW

In [None]:
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Split the data into training and testing sets
train_data = combined_data[combined_data['Date'] < '02/28/2023']
test_data = combined_data[combined_data['Date'] >= '02/28/2023']

# Prepare the data for machine learning
X_train_ozone = train_data[['SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'Ozone']]
y_train_ozone = train_data['DAILY_AQI_VALUE_x']
X_test_ozone = test_data[['SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'Ozone']]
y_test_ozone = test_data['DAILY_AQI_VALUE_x']

X_train_pm25 = train_data[['SITE_LATITUDE_y', 'SITE_LONGITUDE_y', 'PM2.5']]
y_train_pm25 = train_data['DAILY_AQI_VALUE_y']
X_test_pm25 = test_data[['SITE_LATITUDE_y', 'SITE_LONGITUDE_y', 'PM2.5']]
y_test_pm25 = test_data['DAILY_AQI_VALUE_y']

# Normalize the data
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_ozone_scaled = scaler.fit_transform(X_train_ozone)
X_test_ozone_scaled = scaler.transform(X_test_ozone)

X_train_pm25_scaled = scaler.fit_transform(X_train_pm25)
X_test_pm25_scaled = scaler.transform(X_test_pm25)

# Define the NN model for Ozone AQI
def create_model_ozone(input_shape):
    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_shape=input_shape))
    model.add(Dense(units=32, activation='relu'))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Train the NN model for Ozone AQI
model_ozone = create_model_ozone(X_train_ozone_scaled.shape[1:])
history_ozone = model_ozone.fit(X_train_ozone_scaled, y_train_ozone, epochs=50, batch_size=32, validation_data=(X_test_ozone_scaled, y_test_ozone), verbose=0)

# Predict using the trained model
nn_predictions_ozone = model_ozone.predict(X_test_ozone_scaled)

# Calculate evaluation metrics for Ozone AQI (NN)
nn_r2_ozone = r2_score(y_test_ozone, nn_predictions_ozone)
nn_mae_ozone = mean_absolute_error(y_test_ozone, nn_predictions_ozone)
nn_mse_ozone = mean_squared_error(y_test_ozone, nn_predictions_ozone)
nn_rmse_ozone = mean_squared_error(y_test_ozone, nn_predictions_ozone, squared=False)

print("R-squared score for Ozone AQI (NN):", nn_r2_ozone)
print("MAE score for Ozone AQI (NN):", nn_mae_ozone)
print("MSE score for Ozone AQI (NN):", nn_mse_ozone)
print("RMSE score for Ozone AQI (NN):", nn_rmse_ozone)

# Define the NN model for PM 2.5 AQI
def create_model_pm25(input_shape):
    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_shape=input_shape))
    model.add(Dense(units=32, activation='relu'))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Train the NN model for PM 2.5 AQI
model_pm25 = create_model_pm25(X_train_pm25_scaled.shape[1:])
history_pm25 = model_pm25.fit(X_train_pm25_scaled, y_train_pm25, epochs=50, batch_size=32, validation_data=(X_test_pm25_scaled, y_test_pm25), verbose=0)

# Predict using the trained model
nn_predictions_pm25 = model_pm25.predict(X_test_pm25_scaled)

# Calculate evaluation metrics for PM 2.5 AQI (NN)
nn_r2_pm25 = r2_score(y_test_pm25, nn_predictions_pm25)
nn_mae_pm25 = mean_absolute_error(y_test_pm25, nn_predictions_pm25)
nn_mse_pm25 = mean_squared_error(y_test_pm25, nn_predictions_pm25)
nn_rmse_pm25 = mean_squared_error(y_test_pm25, nn_predictions_pm25, squared=False)

print("R-squared score for PM 2.5 AQI (NN):", nn_r2_pm25)
print("MAE score for PM 2.5 AQI (NN):", nn_mae_pm25)
print("MSE score for PM 2.5 AQI (NN):", nn_mse_pm25)
print("RMSE score for PM 2.5 AQI (NN):", nn_rmse_pm25)


R-squared score for Ozone AQI (NN): 0.6210579852720088
MAE score for Ozone AQI (NN): 5.507553148919058
MSE score for Ozone AQI (NN): 248.8375984769576
RMSE score for Ozone AQI (NN): 15.774587109555597
R-squared score for PM 2.5 AQI (NN): 0.9277819863078642
MAE score for PM 2.5 AQI (NN): 0.7858657215506283
MSE score for PM 2.5 AQI (NN): 48.80807438869676
RMSE score for PM 2.5 AQI (NN): 6.986277577415369


Linear with Adaboost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Split the data into training and testing sets
train_data = combined_data[combined_data['Date'] < '02/28/2023']
test_data = combined_data[combined_data['Date'] >= '02/28/2023']

# Prepare the data for machine learning
X_train_ozone = train_data[['SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'Ozone']]
y_train_ozone = train_data['DAILY_AQI_VALUE_x']
X_test_ozone = test_data[['SITE_LATITUDE_x', 'SITE_LONGITUDE_x', 'Ozone']]
y_test_ozone = test_data['DAILY_AQI_VALUE_x']

X_train_pm25 = train_data[['SITE_LATITUDE_y', 'SITE_LONGITUDE_y', 'PM2.5']]
y_train_pm25 = train_data['DAILY_AQI_VALUE_y']
X_test_pm25 = test_data[['SITE_LATITUDE_y', 'SITE_LONGITUDE_y', 'PM2.5']]
y_test_pm25 = test_data['DAILY_AQI_VALUE_y']

#Train an Adaboost model for Ozone AQI with Linear Regression as base estimator
ada_model_ozone = AdaBoostRegressor(base_estimator=LinearRegression(), n_estimators=50, random_state=42)
ada_model_ozone.fit(X_train_ozone, y_train_ozone)
ada_predictions_ozone = ada_model_ozone.predict(X_test_ozone)
ada_r2_ozone = r2_score(y_test_ozone, ada_predictions_ozone)

#Train an Adaboost model for PM 2.5 AQI with Linear Regression as base estimator
ada_model_pm25 = AdaBoostRegressor(base_estimator=LinearRegression(), n_estimators=50, random_state=42)
ada_model_pm25.fit(X_train_pm25, y_train_pm25)
ada_predictions_pm25 = ada_model_pm25.predict(X_test_pm25)
ada_r2_pm25 = r2_score(y_test_pm25, ada_predictions_pm25)

print("R-squared score for Ozone AQI (Adaboost with Linear Regression):", ada_r2_ozone)
print("R-squared score for PM 2.5 AQI (Adaboost with Linear Regression):", ada_r2_pm25)

#Calculate MAE for Ozone AQI
ada_mae_ozone = mean_absolute_error(y_test_ozone, ada_predictions_ozone)

#Calculate MSE for Ozone AQI
ada_mse_ozone = mean_squared_error(y_test_ozone, ada_predictions_ozone)

#Calculate RMSE for Ozone AQI
ada_rmse_ozone = mean_squared_error(y_test_ozone, ada_predictions_ozone, squared=False)

#Calculate MAE for PM 2.5 AQI
ada_mae_pm25 = mean_absolute_error(y_test_pm25, ada_predictions_pm25)

#Calculate MSE for PM 2.5 AQI
ada_mse_pm25 = mean_squared_error(y_test_pm25, ada_predictions_pm25)

#Calculate RMSE for PM 2.5 AQI
ada_rmse_pm25 = mean_squared_error(y_test_pm25, ada_predictions_pm25, squared=False)

print("MAE score for Ozone AQI (Adaboost with Linear Regression):", ada_mae_ozone)
print("MSE score for Ozone AQI (Adaboost with Linear Regression):", ada_mse_ozone)
print("RMSE score for Ozone AQI (Adaboost with Linear Regression):", ada_rmse_ozone)
print("MAE score for PM 2.5 AQI (Adaboost with Linear Regression):", ada_mae_pm25)
print("MSE score for PM 2.5 AQI (Adaboost with Linear Regression):", ada_mse_pm25)
print("RMSE score for PM 2.5 AQI (Adaboost with Linear Regression):", ada_rmse_pm25)



R-squared score for Ozone AQI (Adaboost with Linear Regression): 0.7860274945053976
R-squared score for PM 2.5 AQI (Adaboost with Linear Regression): 0.6802477503353441
MAE score for Ozone AQI (Adaboost with Linear Regression): 5.468598164194392
MSE score for Ozone AQI (Adaboost with Linear Regression): 140.50805225594718
RMSE score for Ozone AQI (Adaboost with Linear Regression): 11.85360925017976
MAE score for PM 2.5 AQI (Adaboost with Linear Regression): 4.744794306957116
MSE score for PM 2.5 AQI (Adaboost with Linear Regression): 216.10247623419653
RMSE score for PM 2.5 AQI (Adaboost with Linear Regression): 14.700424355582275


