In [2]:
import pandas as pd

# Absolute location of the raw pollution CSV on the local machine.
data_path = r"C:\Users\NXTWAVE\Downloads\Particale 2.5 Detection\archive\pollution_us_2000_2016.csv"

# Read the whole file into a DataFrame.
df = pd.read_csv(data_path)

# Banner and column index in a single call; the newline separator yields the
# same text as two consecutive print() calls.
print("\n====== COLUMN NAMES IN DATASET ======\n", df.columns, sep="\n")




Index(['Unnamed: 0', 'State Code', 'County Code', 'Site Num', 'Address',
       'State', 'County', 'City', 'Date Local', 'NO2 Units', 'NO2 Mean',
       'NO2 1st Max Value', 'NO2 1st Max Hour', 'NO2 AQI', 'O3 Units',
       'O3 Mean', 'O3 1st Max Value', 'O3 1st Max Hour', 'O3 AQI', 'SO2 Units',
       'SO2 Mean', 'SO2 1st Max Value', 'SO2 1st Max Hour', 'SO2 AQI',
       'CO Units', 'CO Mean', 'CO 1st Max Value', 'CO 1st Max Hour', 'CO AQI'],
      dtype='object')


In [5]:
import pandas as pd
import numpy as np
import json
import yaml
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from tensorflow import keras

# ---------------------------------------------------------
# PATHS
# ---------------------------------------------------------
# data_path: raw CSV that is read below.
# save_path: directory that every artifact of this cell is written into.
# NOTE(review): both are absolute, machine-specific Windows paths.
data_path = r"C:\Users\NXTWAVE\Downloads\Particale 2.5 Detection\archive\pollution_us_2000_2016.csv"
save_path = r"C:\Users\NXTWAVE\Downloads\Particale 2.5 Detection"

# ---------------------------------------------------------
# LOAD DATA
# ---------------------------------------------------------
df = pd.read_csv(data_path)

# One print call; sep="\n" reproduces the header line followed by the index.
print("\n=== Columns ===", df.columns, sep="\n")

# ---------------------------------------------------------
# REMOVE STRING COLUMNS THAT CAUSE ERRORS
# ---------------------------------------------------------
# The address and the four per-pollutant unit columns are discarded outright
# instead of being encoded.
drop_columns = ["Address", "NO2 Units", "O3 Units", "SO2 Units", "CO Units"]
df = df.drop(columns=drop_columns)

# ---------------------------------------------------------
# TARGET (you can change to O3 Mean / SO2 Mean / CO Mean)
# ---------------------------------------------------------
target = "NO2 Mean"

# Rows without a target value cannot be used for supervised training.
df = df.dropna(subset=[target])

# ---------------------------------------------------------
# DATE PROCESSING
# ---------------------------------------------------------
# Parse the date once, then swap the raw column for numeric year/month/day
# columns appended at the end of the frame.
date_local = pd.to_datetime(df["Date Local"])
df = df.drop(columns=["Date Local"]).assign(
    Year=date_local.dt.year,
    Month=date_local.dt.month,
    Day=date_local.dt.day,
)

# ---------------------------------------------------------
# LABEL ENCODE CATEGORICAL COLUMNS
# ---------------------------------------------------------
label_cols = ["State", "County", "City"]

# One encoder per column, kept by column name so the integer codes can be
# mapped back to the original labels later.
encoders = {col: LabelEncoder() for col in label_cols}
for col, le in encoders.items():
    df[col] = le.fit_transform(df[col])

# Persist the fitted encoders next to the other artifacts.
joblib.dump(encoders, save_path + r"\label_encoders.pkl")

# ---------------------------------------------------------
# FEATURES / TARGET
# ---------------------------------------------------------
# NOTE(review): X keeps every remaining column, including 'Unnamed: 0' (looks
# like a saved row index) and the other NO2-derived columns
# ('NO2 1st Max Value', 'NO2 1st Max Hour', 'NO2 AQI') — confirm these are
# intended predictors of the target.
X = df.drop(columns=[target])
y = df[target]

# ---------------------------------------------------------
# TRAIN/TEST SPLIT (performed BEFORE scaling)
# ---------------------------------------------------------
# Splitting the raw features first keeps the held-out rows away from the
# scaler. Fitting MinMaxScaler on the full matrix leaks the test-set min/max
# into the training features and into the persisted scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---------------------------------------------------------
# SCALING (fitted on the training rows only)
# ---------------------------------------------------------
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
# Held-out rows are only transformed; values may fall outside [0, 1] when a
# test value exceeds the range seen during training.
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the train-fitted scaler; kept so that any later cell
# still referring to X_scaled continues to work.
X_scaled = scaler.transform(X)

joblib.dump(scaler, save_path + r"\scaler.pkl")

# ---------------------------------------------------------
# MODEL TRAINING
# ---------------------------------------------------------
# Hyper-parameters gathered in one mapping so they are easy to spot and tune.
rf_params = {"n_estimators": 300, "max_depth": 20, "random_state": 42}

# fit() returns the estimator itself, so construction and training chain.
model = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# Score the fitted forest on the held-out split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nMSE:", mse)
print("R2:", r2)

# ---------------------------------------------------------
# SAVE MODELS
# ---------------------------------------------------------
# Pickle the fitted regressor.
joblib.dump(model, save_path + r"\pollution_model.pkl")

# Descriptive metadata (estimator name, target column, feature order); the
# same mapping is written below as JSON and as YAML.
model_json = dict(
    model="RandomForestRegressor",
    target=target,
    features=X.columns.tolist(),
)

# JSON
with open(save_path + r"\pollution_model.json", "w") as json_file:
    json.dump(model_json, json_file, indent=4)

# YAML
with open(save_path + r"\pollution_model.yaml", "w") as yaml_file:
    yaml.dump(model_json, yaml_file)

# ---------------------------------------------------------
# H5 KERAS WRAPPER
# ---------------------------------------------------------
# Builds a single Dense(1) layer over the feature vector and writes it to an
# HDF5 file.
# NOTE(review): this network is never compiled or fitted and has no link to
# the random forest trained above, so the .h5 file holds a freshly initialised
# layer rather than the trained model — confirm that this is intended.
n_features = X_train.shape[1]
input_layer = keras.Input(shape=(n_features,))
output_layer = keras.layers.Dense(units=1)(input_layer)
keras_model = keras.Model(inputs=input_layer, outputs=output_layer)
keras_model.save(save_path + r"\pollution_model.h5")

print("\nâœ… All model files saved in:", save_path)



=== Columns ===
Index(['Unnamed: 0', 'State Code', 'County Code', 'Site Num', 'Address',
       'State', 'County', 'City', 'Date Local', 'NO2 Units', 'NO2 Mean',
       'NO2 1st Max Value', 'NO2 1st Max Hour', 'NO2 AQI', 'O3 Units',
       'O3 Mean', 'O3 1st Max Value', 'O3 1st Max Hour', 'O3 AQI', 'SO2 Units',
       'SO2 Mean', 'SO2 1st Max Value', 'SO2 1st Max Hour', 'SO2 AQI',
       'CO Units', 'CO Mean', 'CO 1st Max Value', 'CO 1st Max Hour', 'CO AQI'],
      dtype='object')

MSE: 1.5881062684533043
R2: 0.9824936224453027


âœ… All model files saved in: C:\Users\NXTWAVE\Downloads\Particale 2.5 Detection


  saving_api.save_model(
