In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import pickle

In [11]:
def load_data():
    csv_files = [f"D:\\ML Predictions\\AirPollutionUI\\air-pollution-ml\\flask-server\\AllPM_merged\\{year}_merged_phora_newest.csv" for year in range(2018, 2024)]
    dataframes = []
    for i, file in enumerate(csv_files):
        try:
            df = pd.read_csv(file)
            dataframes.append(df)
            print(f"Loaded {file} ({i + 1}/{len(csv_files)})")
        except FileNotFoundError:
            print(f"File {file} not found.")
    return pd.concat(dataframes, ignore_index=True)

data = load_data()
data

Loaded D:\ML Predictions\AirPollutionUI\air-pollution-ml\flask-server\AllPM_merged\2018_merged_phora_newest.csv (1/6)
Loaded D:\ML Predictions\AirPollutionUI\air-pollution-ml\flask-server\AllPM_merged\2019_merged_phora_newest.csv (2/6)
Loaded D:\ML Predictions\AirPollutionUI\air-pollution-ml\flask-server\AllPM_merged\2020_merged_phora_newest.csv (3/6)
Loaded D:\ML Predictions\AirPollutionUI\air-pollution-ml\flask-server\AllPM_merged\2021_merged_phora_newest.csv (4/6)
Loaded D:\ML Predictions\AirPollutionUI\air-pollution-ml\flask-server\AllPM_merged\2022_merged_phora_newest.csv (5/6)
Loaded D:\ML Predictions\AirPollutionUI\air-pollution-ml\flask-server\AllPM_merged\2023_merged_phora_newest.csv (6/6)


Unnamed: 0,Site,Parameter,Date (LT),Year,Month,Day,Hour,NowCast Conc.,AQI,AQI Category,...,Conc. Unit,Duration,QC Name,time,Temperature (°C),dewpoint_2m,Relative Humidity (%),Precipitation (mm),windspeed_10m,Pressure (hPa)
0,Phora Durbar Kathmandu,PM2.5 - Principal,2018-01-02 04:00 PM,2018,1,2,16,12.3,51,Moderate,...,UG/M3,1 Hr,Valid,2018-01-02 16:00:00,6.0,5.0,94.0,0.0,6.0,1014.3
1,Phora Durbar Kathmandu,PM2.5 - Principal,2018-01-02 05:00 PM,2018,1,2,17,16.3,60,Moderate,...,UG/M3,1 Hr,Valid,2018-01-02 17:00:00,5.3,4.4,94.0,0.0,6.5,1014.0
2,Phora Durbar Kathmandu,PM2.5 - Principal,2018-01-02 06:00 PM,2018,1,2,18,23.2,74,Moderate,...,UG/M3,1 Hr,Valid,2018-01-02 18:00:00,4.6,3.7,94.0,0.0,6.4,1013.6
3,Phora Durbar Kathmandu,PM2.5 - Principal,2018-01-02 07:00 PM,2018,1,2,19,52.5,143,Unhealthy for Sensitive Groups,...,UG/M3,1 Hr,Valid,2018-01-02 19:00:00,4.3,3.1,92.0,0.0,5.5,1013.3
4,Phora Durbar Kathmandu,PM2.5 - Principal,2018-01-02 08:00 PM,2018,1,2,20,73.5,160,Unhealthy,...,UG/M3,1 Hr,Valid,2018-01-02 20:00:00,4.1,2.7,91.0,0.0,3.3,1012.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50286,Phora Durbar Kathmandu,PM2.5 - Principal,2023-12-30 01:00 AM,2023,12,30,1,64.5,156,Unhealthy,...,UG/M3,1 Hr,Valid,2023-12-30 01:00:00,7.7,5.5,86.0,0.0,4.0,1019.2
50287,Phora Durbar Kathmandu,PM2.5 - Principal,2023-12-30 02:00 AM,2023,12,30,2,72.2,160,Unhealthy,...,UG/M3,1 Hr,Valid,2023-12-30 02:00:00,8.0,5.9,86.0,0.0,5.5,1019.7
50288,Phora Durbar Kathmandu,PM2.5 - Principal,2023-12-30 03:00 AM,2023,12,30,3,91.6,170,Unhealthy,...,UG/M3,1 Hr,Valid,2023-12-30 03:00:00,11.8,8.3,79.0,0.0,5.9,1020.1
50289,Phora Durbar Kathmandu,PM2.5 - Principal,2023-12-30 04:00 AM,2023,12,30,4,65.3,156,Unhealthy,...,UG/M3,1 Hr,Valid,2023-12-30 04:00:00,15.6,7.8,60.0,0.0,1.5,1020.4


In [12]:
features = ['Month','Day', 'Hour', 'Temperature (°C)', 'dewpoint_2m', 'Relative Humidity (%)', 'Precipitation (mm)', 'windspeed_10m', 'Pressure (hPa)']
target = 'AQI'

In [13]:
cleaned_data = data.dropna(subset=features + [target])
X = cleaned_data[features]
y = cleaned_data[target]
X


Unnamed: 0,Month,Day,Hour,Temperature (°C),dewpoint_2m,Relative Humidity (%),Precipitation (mm),windspeed_10m,Pressure (hPa)
0,1,2,16,6.0,5.0,94.0,0.0,6.0,1014.3
1,1,2,17,5.3,4.4,94.0,0.0,6.5,1014.0
2,1,2,18,4.6,3.7,94.0,0.0,6.4,1013.6
3,1,2,19,4.3,3.1,92.0,0.0,5.5,1013.3
4,1,2,20,4.1,2.7,91.0,0.0,3.3,1012.9
...,...,...,...,...,...,...,...,...,...
50286,12,30,1,7.7,5.5,86.0,0.0,4.0,1019.2
50287,12,30,2,8.0,5.9,86.0,0.0,5.5,1019.7
50288,12,30,3,11.8,8.3,79.0,0.0,5.9,1020.1
50289,12,30,4,15.6,7.8,60.0,0.0,1.5,1020.4


In [14]:
poly = PolynomialFeatures(degree=3, include_bias=False)
X_poly = poly.fit_transform(X)

In [15]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_poly)

In [None]:
#Training the Random Forest Regressor model
model = RandomForestRegressor(
    n_estimators= 414,
    max_depth= 40,
    min_samples_split = 3,
    min_samples_leaf = 3,
    max_features = 'sqrt',
    random_state=42)


In [17]:
sample_data = cleaned_data.sample(frac=0.5, random_state=42)
X_sample = sample_data[features]
y_sample = sample_data[target]
X_poly_sample = poly.fit_transform(X_sample)
X_scaled_sample = scaler.fit_transform(X_poly_sample)
model.fit(X_scaled_sample, y_sample)

In [18]:
from sklearn.metrics import root_mean_squared_error
y_pred = model.predict(X_scaled_sample)
mae = mean_absolute_error(y_sample, y_pred)
r2 = r2_score(y_sample, y_pred)
rmse = root_mean_squared_error(y_sample, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Root Mean Sqaure Error (MAE):", rmse)
print("R^2 Score:", r2)


print(y_pred)
print(len(y_pred))

Mean Absolute Error (MAE): 8.889824343635318
Root Mean Sqaure Error (MAE): 13.362212353588326
R^2 Score: 0.9385056602851439
[161.46310813 164.07998795  99.32311129 ... 153.61107408  60.60869445
 166.26920407]
25142


In [19]:
with open("AQI_RandomForest_phora_newest.pkl", "wb") as f:
    pickle.dump(model, f)

with open("Scaler_RandomForest_phora_newest.pkl", "wb") as f:
    pickle.dump(scaler, f)

with open("Poly_RandomForest_phora_newest.pkl", "wb") as f:
    pickle.dump(poly, f)


In [48]:


#x_data_new_aqi = [['89.0', '0' ,'0.72000', '1020.900024', '12', '14', '3', '7.253523']]

example_features = {
    "Month": '1',
    "Day": '3',
    "Hour": '9',
    "Temperature (°C)": '4.3',
    "dewpoint_2m": '3.5',
    "Relative Humidity (%)": '95',
    "Precipitation (mm)": '0',
    "windspeed_10m": '3.3',
    "Pressure (hPa)": '1020.6'
}

with open('D:\\ML Predictions\\AirPollutionUI\\air-pollution-ml\\flask-server\\pages\\AQI_RandomForest.pkl', 'rb') as file:
    aqi_model3 = pickle.load(file)

with open('D:\\ML Predictions\\AirPollutionUI\\air-pollution-ml\\flask-server\\pages\\Scaler_RandomForest.pkl', 'rb') as file:
    scaler3 = pickle.load(file)

with open('D:\\ML Predictions\\AirPollutionUI\\air-pollution-ml\\flask-server\\pages\\Poly_RandomForest.pkl', 'rb') as file:
    poly = pickle.load(file)



input_data1 = pd.DataFrame([example_features])
X_poly_sample1 = poly.transform(input_data1)
X_scaled_sample1 = scaler3.transform(X_poly_sample1)

y_pred = model.predict(X_scaled_sample1)
y_pred





array([156.38891063])